more task names

docker
add task names
2026-06-30 18:50:36 +00:00 · 2024-04-19 16:04:34 +01:00 · 2024-04-19 15:31:23 +01:00 · 2024-04-19 15:11:43 +01:00 · 2024-04-19 15:02:52 +01:00 · 2024-04-19 14:54:22 +01:00
278 changed files with 6692 additions and 17049 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -2,6 +2,7 @@
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
 rustdocflags = ["-Arustdoc::private_intra_doc_links"]
+rustflags = ["--cfg", "tokio_unstable"]

 [alias]
 build_testing = ["build", "--features", "testing"]
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -214,6 +214,7 @@ jobs:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
+      RUSTFLAGS: "--cfg=tokio_unstable"

    steps:
      - name: Fix git ownership
@@ -236,6 +237,27 @@ jobs:
          submodules: true
          fetch-depth: 1

+      - name: Check Postgres submodules revision
+        shell: bash -euo pipefail {0}
+        run: |
+          # This is a temporary solution to ensure that the Postgres submodules revision is correct (i.e. it was updated intentionally).
+          # Eventually it will be replaced by a regression test https://github.com/neondatabase/neon/pull/4603
+
+          FAILED=false
+          for postgres in postgres-v14 postgres-v15 postgres-v16; do
+            expected=$(cat vendor/revisions.json | jq --raw-output '."'"${postgres}"'"')
+            actual=$(git rev-parse "HEAD:vendor/${postgres}")
+            if [ "${expected}" != "${actual}" ]; then
+              echo >&2 "Expected ${postgres} rev to be at '${expected}', but it is at '${actual}'"
+              FAILED=true
+            fi
+          done
+
+          if [ "${FAILED}" = "true" ]; then
+            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
+            exit 1
+          fi
+
      - name: Set pg 14 revision for caching
        id: pg_v14_rev
        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
@@ -456,8 +478,6 @@ jobs:
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -537,9 +557,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -719,7 +736,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3

      - uses: docker/login-action@v3
        with:
@@ -776,7 +793,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -849,7 +866,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.23.2

    steps:
      - name: Checkout
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,21 +45,21 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = "0.19"
-azure_identity = "0.19"
-azure_storage = "0.19"
-azure_storage_blobs = "0.19"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "1.3", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "1.26"
+aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.14"
 aws-sdk-iam = "1.15.0"
-aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
-aws-smithy-types = "1.1.9"
-aws-credential-types = "1.2.0"
-aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
-aws-types = "1.2.0"
+aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.1.4"
+aws-credential-types = "1.1.4"
+aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
+aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -71,7 +71,7 @@ camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
-comfy-table = "7.1.1"
+comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
@@ -86,8 +86,8 @@ futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
-hashbrown = "0.14"
-hashlink = "0.9"
+hashbrown = "0.13"
+hashlink = "0.8.4"
 hdrhistogram = "7.5.2"
 hex = "0.4"
 hex-literal = "0.4"
@@ -99,7 +99,6 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.13.0"
-indexmap = "2"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
@@ -110,33 +109,33 @@ libc = "0.2"
 md5 = "0.7.0"
 measured = { version = "0.0.21", features=["lasso"] }
 measured-process = { version = "0.0.21" }
-memoffset = "0.9.1"
+memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.22.0"
-opentelemetry_sdk = "0.22.0"
-opentelemetry-otlp = { version = "0.15.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.14.0"
+opentelemetry = "0.20.0"
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
-prost = "0.12"
+prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
-reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_22"] }
-reqwest-middleware = "0.3.0"
-reqwest-retry = "0.5"
+reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
+reqwest-middleware = "0.2.0"
+reqwest-retry = "0.2.2"
 routerify = "3"
-rpds = "1.1"
+rpds = "0.13"
 rustc-hash = "1.1.0"
 rustls = "0.22"
 rustls-pemfile = "2"
@@ -144,24 +143,24 @@ rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
-sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
-serde_with = "3.0"
-serde_assert = "0.7.0"
+serde_with = "2.0"
+serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
-strum = "0.26"
-strum_macros = "0.26"
+strum = "0.24"
+strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# https://github.com/nical/rust_debug/pull/4
-svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
+svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
+task-local-extensions = "0.1.4"
 test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
@@ -176,17 +175,17 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
-tonic = {version = "0.11", features = ["tls", "tls-roots"]}
-tower-service = "0.3.2"
+tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.23.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-opentelemetry = "0.20.0"
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
+webpki-roots = "0.25"
 x509-parser = "0.15"

 ## TODO replace this with tracing
@@ -230,8 +229,9 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 ## Build dependencies
 criterion = "0.5.1"
 rcgen = "0.12"
+rstest = "0.18"
 camino-tempfile = "1.0.2"
-tonic-build = "0.11"
+tonic-build = "0.9"

 [patch.crates-io]

--- a/2
+++ b/2
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build  \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment --cfg=tokio_unstable" cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -65,7 +65,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
    && mv s5cmd /usr/local/bin/s5cmd

 # LLVM
-ENV LLVM_VERSION=18
+ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -141,7 +141,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.78.0
+ENV RUSTC_VERSION=1.77.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/29
+++ b/29
@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	ifndef DISABLE_HOMEBREW
-		# macOS with brew-installed openssl requires explicit paths
-		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
-		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
-	endif
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
@@ -81,14 +79,11 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
 		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
-
-	VERSION=$*; \
-	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
-	(cd $(POSTGRES_INSTALL_DIR)/build/$$VERSION && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
+	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
+	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
 		CFLAGS='$(PG_CFLAGS)' \
-		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
+		$(PG_CONFIGURE_OPTS) \
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$* > configure.log)

 # nicer alias to run 'configure'
 # Note: I've been unable to use templates for this part of our configuration.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -17,7 +17,6 @@ nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
-opentelemetry_sdk.workspace = true
 postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
@@ -39,9 +38,7 @@ utils.workspace = true
 workspace_hack.workspace = true
 toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.13"
 bytes = "1.0"
 rust-ini = "0.20.0"
-
-[target.'cfg(target_os = "linux")'.dependencies]
-vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -47,11 +47,10 @@ use chrono::Utc;
 use clap::Arg;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
-use tracing::{error, info, warn};
+use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;
-use compute_api::spec::ComputeSpec;

 use compute_tools::compute::{
    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
@@ -63,41 +62,12 @@ use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
-use compute_tools::swap::resize_swap;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
 const BUILD_TAG_DEFAULT: &str = "latest";

 fn main() -> Result<()> {
-    let (build_tag, clap_args) = init()?;
-
-    let (pg_handle, start_pg_result) = {
-        // Enter startup tracing context
-        let _startup_context_guard = startup_context_from_env();
-
-        let cli_args = process_cli(&clap_args)?;
-
-        let cli_spec = try_spec_from_cli(&clap_args, &cli_args)?;
-
-        let wait_spec_result = wait_spec(build_tag, cli_args, cli_spec)?;
-
-        start_postgres(&clap_args, wait_spec_result)?
-
-        // Startup is finished, exit the startup tracing span
-    };
-
-    // PostgreSQL is now running, if startup was successful. Wait until it exits.
-    let wait_pg_result = wait_postgres(pg_handle)?;
-
-    let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
-
-    maybe_delay_exit(delay_exit);
-
-    deinit_and_exit(wait_pg_result);
-}
-
-fn init() -> Result<(String, clap::ArgMatches)> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -112,15 +82,9 @@ fn init() -> Result<(String, clap::ArgMatches)> {
        .to_string();
    info!("build_tag: {build_tag}");

-    Ok((build_tag, cli().get_matches()))
-}
-
-fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
-    let pgbin_default = "postgres";
-    let pgbin = matches
-        .get_one::<String>("pgbin")
-        .map(|s| s.as_str())
-        .unwrap_or(pgbin_default);
+    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);

    let ext_remote_storage = matches
        .get_one::<String>("remote-ext-config")
@@ -146,32 +110,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");

-    Ok(ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        http_port,
-        spec_json,
-        spec_path,
-        resize_swap_on_bind,
-    })
-}
-
-struct ProcessCliResult<'clap> {
-    connstr: &'clap str,
-    pgdata: &'clap str,
-    pgbin: &'clap str,
-    ext_remote_storage: Option<&'clap str>,
-    http_port: u16,
-    spec_json: Option<&'clap String>,
-    spec_path: Option<&'clap String>,
-    resize_swap_on_bind: bool,
-}
-
-fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -208,9 +147,9 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
    if let Ok(val) = std::env::var("TRACESTATE") {
        startup_tracing_carrier.insert("tracestate".to_string(), val);
    }
-    if !startup_tracing_carrier.is_empty() {
+    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry_sdk::propagation::TraceContextPropagator;
+        use opentelemetry::sdk::propagation::TraceContextPropagator;
        let guard = TraceContextPropagator::new()
            .extract(&startup_tracing_carrier)
            .attach();
@@ -218,17 +157,8 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
        Some(guard)
    } else {
        None
-    }
-}
+    };

-fn try_spec_from_cli(
-    matches: &clap::ArgMatches,
-    ProcessCliResult {
-        spec_json,
-        spec_path,
-        ..
-    }: &ProcessCliResult,
-) -> Result<CliSpecParams> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

@@ -269,34 +199,6 @@ fn try_spec_from_cli(
        }
    };

-    Ok(CliSpecParams {
-        spec,
-        live_config_allowed,
-    })
-}
-
-struct CliSpecParams {
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    live_config_allowed: bool,
-}
-
-fn wait_spec(
-    build_tag: String,
-    ProcessCliResult {
-        connstr,
-        pgdata,
-        pgbin,
-        ext_remote_storage,
-        resize_swap_on_bind,
-        http_port,
-        ..
-    }: ProcessCliResult,
-    CliSpecParams {
-        spec,
-        live_config_allowed,
-    }: CliSpecParams,
-) -> Result<WaitSpecResult> {
    let mut new_state = ComputeState::new();
    let spec_set;

@@ -324,17 +226,19 @@ fn wait_spec(

    // If this is a pooled VM, prewarm before starting HTTP server and becoming
    // available for binding. Prewarming helps Postgres start quicker later,
-    // because QEMU will already have its memory allocated from the host, and
+    // because QEMU will already have its memory allocated from the host, and
    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }

-    // Launch http service first, so that we can serve control-plane requests
-    // while configuration is still in progress.
+    // Launch http service first, so that we are able to serve control-plane
+    // requests while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
@@ -349,45 +253,21 @@ fn wait_spec(
                break;
            }
        }
-
-        // Record for how long we slept waiting for the spec.
-        let now = Utc::now();
-        state.metrics.wait_for_spec_ms = now
-            .signed_duration_since(state.start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // Reset start time, so that the total startup time that is calculated later will
-        // not include the time that we waited for the spec.
-        state.start_time = now;
    }

-    Ok(WaitSpecResult {
-        compute,
-        http_port,
-        resize_swap_on_bind,
-    })
-}
-
-struct WaitSpecResult {
-    compute: Arc<ComputeNode>,
-    // passed through from ProcessCliResult
-    http_port: u16,
-    resize_swap_on_bind: bool,
-}
-
-fn start_postgres(
-    // need to allow unused because `matches` is only used if target_os = "linux"
-    #[allow(unused_variables)] matches: &clap::ArgMatches,
-    WaitSpecResult {
-        compute,
-        http_port,
-        resize_swap_on_bind,
-    }: WaitSpecResult,
-) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // total startup time is properly measured at the end.
+    state.start_time = Utc::now();
+
    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();

@@ -395,72 +275,33 @@ fn start_postgres(
        "running compute with features: {:?}",
        state.pspec.as_ref().unwrap().spec.features
    );
-    // before we release the mutex, fetch the swap size (if any) for later.
-    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
    drop(state);

    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);

-    let mut prestartup_failed = false;
-    let mut delay_exit = false;
-
-    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
-        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
-        // *before* starting postgres.
-        //
-        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
-        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
-        // OOM-killed during startup because swap wasn't available yet.
-        match resize_swap(size_bytes) {
-            Ok(()) => {
-                let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_gib, "resized swap");
-            }
-            Err(err) => {
-                let err = err.context("failed to resize swap");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{err:?}"));
-                state.status = ComputeStatus::Failed;
-                compute.state_changed.notify_all();
-                delay_exit = true;
-            }
-        }
-    }
-
-    let extension_server_port: u16 = http_port;
-
    // Start Postgres
-    let mut pg = None;
-    if !prestartup_failed {
-        pg = match compute.start_compute(extension_server_port) {
-            Ok(pg) => Some(pg),
-            Err(err) => {
-                error!("could not start the compute node: {:#}", err);
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{:?}", err));
-                state.status = ComputeStatus::Failed;
-                // Notify others that Postgres failed to start. In case of configuring the
-                // empty compute, it's likely that API handler is still waiting for compute
-                // state change. With this we will notify it that compute is in Failed state,
-                // so control plane will know about it earlier and record proper error instead
-                // of timeout.
-                compute.state_changed.notify_all();
-                drop(state); // unlock
-                delay_exit = true;
-                None
-            }
-        };
-    } else {
-        warn!("skipping postgres startup because pre-startup step failed");
-    }
+    let mut delay_exit = false;
+    let mut exit_code = None;
+    let pg = match compute.start_compute(extension_server_port) {
+        Ok(pg) => Some(pg),
+        Err(err) => {
+            error!("could not start the compute node: {:#}", err);
+            let mut state = compute.state.lock().unwrap();
+            state.error = Some(format!("{:?}", err));
+            state.status = ComputeStatus::Failed;
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
+            delay_exit = true;
+            None
+        }
+    };

    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
    // because it requires cgroups.
@@ -493,7 +334,7 @@ fn start_postgres(
            // This token is used internally by the monitor to clean up all threads
            let token = CancellationToken::new();

-            let vm_monitor = rt.as_ref().map(|rt| {
+            let vm_monitor = &rt.as_ref().map(|rt| {
                rt.spawn(vm_monitor::start(
                    Box::leak(Box::new(vm_monitor::Args {
                        cgroup: cgroup.cloned(),
@@ -506,41 +347,12 @@ fn start_postgres(
        }
    }

-    Ok((
-        pg,
-        StartPostgresResult {
-            delay_exit,
-            compute,
-            #[cfg(target_os = "linux")]
-            rt,
-            #[cfg(target_os = "linux")]
-            token,
-            #[cfg(target_os = "linux")]
-            vm_monitor,
-        },
-    ))
-}
-
-type PostgresHandle = (std::process::Child, std::thread::JoinHandle<()>);
-
-struct StartPostgresResult {
-    delay_exit: bool,
-    // passed through from WaitSpecResult
-    compute: Arc<ComputeNode>,
-
-    #[cfg(target_os = "linux")]
-    rt: Option<tokio::runtime::Runtime>,
-    #[cfg(target_os = "linux")]
-    token: tokio_util::sync::CancellationToken,
-    #[cfg(target_os = "linux")]
-    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
-}
-
-fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
-    let mut exit_code = None;
    if let Some((mut pg, logs_handle)) = pg {
+        // Startup is finished, exit the startup tracing span
+        drop(startup_context_guard);
+
        let ecode = pg
            .wait()
            .expect("failed to start waiting on Postgres process");
@@ -555,25 +367,6 @@ fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
        exit_code = ecode.code()
    }

-    Ok(WaitPostgresResult { exit_code })
-}
-
-struct WaitPostgresResult {
-    exit_code: Option<i32>,
-}
-
-fn cleanup_after_postgres_exit(
-    StartPostgresResult {
-        mut delay_exit,
-        compute,
-        #[cfg(target_os = "linux")]
-        vm_monitor,
-        #[cfg(target_os = "linux")]
-        token,
-        #[cfg(target_os = "linux")]
-        rt,
-    }: StartPostgresResult,
-) -> Result<bool> {
    // Terminate the vm_monitor so it releases the file watcher on
    // /sys/fs/cgroup/neon-postgres.
    // Note: the vm-monitor only runs on linux because it requires cgroups.
@@ -615,19 +408,13 @@ fn cleanup_after_postgres_exit(
        error!("error while checking for core dumps: {err:?}");
    }

-    Ok(delay_exit)
-}
-
-fn maybe_delay_exit(delay_exit: bool) {
    // If launch failed, keep serving HTTP requests for a while, so the cloud
    // control plane can get the actual error.
    if delay_exit {
        info!("giving control plane 30s to collect the error before shutdown");
        thread::sleep(Duration::from_secs(30));
    }
-}

-fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
    // Shutdown trace pipeline gracefully, so that it has a chance to send any
    // pending traces before we exit. Shutting down OTEL tracing provider may
    // hang for quite some time, see, for example:
@@ -739,11 +526,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("resize-swap-on-bind")
-                .long("resize-swap-on-bind")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -14,5 +14,4 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod swap;
 pub mod sync_sk;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    if existing_dbs.contains_key(&op.name) {
+                    if existing_dbs.get(&op.name).is_some() {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,36 +0,0 @@
-use anyhow::{anyhow, Context};
-use tracing::warn;
-
-pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
-
-pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
-    // run `/neonvm/bin/resize-swap --once {size_bytes}`
-    //
-    // Passing '--once' causes resize-swap to delete itself after successful completion, which
-    // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
-    // postgres is running.
-    //
-    // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
-    let child_result = std::process::Command::new("/usr/bin/sudo")
-        .arg(RESIZE_SWAP_BIN)
-        .arg("--once")
-        .arg(size_bytes.to_string())
-        .spawn();
-
-    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
-        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-        return Ok(());
-    }
-
-    child_result
-        .context("spawn() failed")
-        .and_then(|mut child| child.wait().context("wait() failed"))
-        .and_then(|status| match status.success() {
-            true => Ok(()),
-            false => Err(anyhow!("process exited with {status}")),
-        })
-        // wrap any prior error with the overall context that we couldn't run the command
-        .with_context(|| {
-            format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
-        })
-}
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,7 +17,6 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
-humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
@@ -25,9 +24,9 @@ scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
+tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
-toml_edit.workspace = true
 tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -9,23 +9,20 @@ use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::{
-    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    SafekeeperConf,
-};
-use control_plane::pageserver::PageServerNode;
+use control_plane::local_env::{InitForceMode, LocalEnv};
+use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::config::{
-    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
-    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
-};
 use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::{
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
+};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::{
@@ -55,6 +52,44 @@ const DEFAULT_PG_VERSION: &str = "15";

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

+/// Renders the built-in default `neon_local init` config (TOML) with `num_pageservers` pageservers.
+fn default_conf(num_pageservers: u16) -> String {
+    let preamble = format!(
+        r#"
+# Default built-in configuration, defined in main.rs
+control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
+
+[broker]
+listen_addr = '{DEFAULT_BROKER_ADDR}'
+
+[[safekeepers]]
+id = {DEFAULT_SAFEKEEPER_ID}
+pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
+http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
+
+"#,
+    );
+
+    // One `[[pageservers]]` table per node; ids and ports count up from the defaults.
+    let pageserver_tables = (0..num_pageservers).map(|offset| {
+        let node_id = NodeId(DEFAULT_PAGESERVER_ID.0 + u64::from(offset));
+        let pg = DEFAULT_PAGESERVER_PG_PORT + offset;
+        let http = DEFAULT_PAGESERVER_HTTP_PORT + offset;
+        let auth = AuthType::Trust;
+        format!(
+            r#"
+[[pageservers]]
+id = {node_id}
+listen_pg_addr = '127.0.0.1:{pg}'
+listen_http_addr = '127.0.0.1:{http}'
+pg_auth_type = '{auth}'
+http_auth_type = '{auth}'
+"#
+        )
+    });
+    std::iter::once(preamble).chain(pageserver_tables).collect()
+}
+
 ///
 /// Timelines tree element used as a value in the HashMap.
 ///
@@ -98,7 +133,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env)),
+            "start" => rt.block_on(handle_start_all(sub_args, &env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -117,7 +152,7 @@ fn main() -> Result<()> {
    };

    match subcommand_result {
-        Ok(Some(updated_env)) => updated_env.persist_config()?,
+        Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?,
        Ok(None) => (),
        Err(e) => {
            eprintln!("command failed: {e:?}");
@@ -306,65 +341,48 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
 }

 fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
-    let num_pageservers = init_match.get_one::<u16>("num-pageservers");
-
-    let force = init_match.get_one("force").expect("we set a default value");
-
-    // Create the in-memory `LocalEnv` that we'd normally load from disk in `load_config`.
-    let init_conf: NeonLocalInitConf = if let Some(config_path) =
-        init_match.get_one::<PathBuf>("config")
-    {
-        // User (likely the Python test suite) provided a description of the environment.
-        if num_pageservers.is_some() {
-            bail!("Cannot specify both --num-pageservers and --config, use key `pageservers` in the --config file instead");
-        }
+    let num_pageservers = init_match
+        .get_one::<u16>("num-pageservers")
+        .expect("num-pageservers arg has a default");
+    // Create config file
+    let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
        // load and parse the file
-        let contents = std::fs::read_to_string(config_path).with_context(|| {
+        std::fs::read_to_string(config_path).with_context(|| {
            format!(
                "Could not read configuration file '{}'",
                config_path.display()
            )
-        })?;
-        toml_edit::de::from_str(&contents)?
+        })?
    } else {
-        // User (likely interactive) did not provide a description of the environment, give them the default
-        NeonLocalInitConf {
-            control_plane_api: Some(Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap())),
-            broker: NeonBroker {
-                listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
-            },
-            safekeepers: vec![SafekeeperConf {
-                id: DEFAULT_SAFEKEEPER_ID,
-                pg_port: DEFAULT_SAFEKEEPER_PG_PORT,
-                http_port: DEFAULT_SAFEKEEPER_HTTP_PORT,
-                ..Default::default()
-            }],
-            pageservers: (0..num_pageservers.copied().unwrap_or(1))
-                .map(|i| {
-                    let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
-                    let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
-                    let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
-                    NeonLocalInitPageserverConf {
-                        id: pageserver_id,
-                        listen_pg_addr: format!("127.0.0.1:{pg_port}"),
-                        listen_http_addr: format!("127.0.0.1:{http_port}"),
-                        pg_auth_type: AuthType::Trust,
-                        http_auth_type: AuthType::Trust,
-                        other: Default::default(),
-                    }
-                })
-                .collect(),
-            pg_distrib_dir: None,
-            neon_distrib_dir: None,
-            default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
-            storage_controller: None,
-            control_plane_compute_hook_api: None,
-        }
+        // Built-in default config
+        default_conf(*num_pageservers)
    };

-    LocalEnv::init(init_conf, force)
-        .context("materialize initial neon_local environment on disk")?;
-    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
+    let pg_version = init_match
+        .get_one::<u32>("pg-version")
+        .copied()
+        .context("Failed to parse postgres version from the argument string")?;
+
+    let mut env =
+        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
+    let force = init_match.get_one("force").expect("we set a default value");
+    env.init(pg_version, force)
+        .context("Failed to initialize neon repository")?;
+
+    // Create remote storage location for default LocalFs remote storage
+    std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+
+    // Initialize pageserver, create initial tenant and timeline.
+    for ps_conf in &env.pageservers {
+        PageServerNode::from_env(&env, ps_conf)
+            .initialize(&pageserver_config_overrides(init_match))
+            .unwrap_or_else(|e| {
+                eprintln!("pageserver init failed: {e:?}");
+                exit(1);
+            });
+    }
+
+    Ok(env)
 }

 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -379,6 +397,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
    PageServerNode::from_env(env, ps_conf)
 }

+/// Returns every `--pageserver-config-override` value given on the command line, borrowed as `&str`.
+fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
+    // `get_many` yields `None` when the flag was not passed; that maps to an empty list.
+    match init_match.get_many::<String>("pageserver-config-override") {
+        Some(values) => values.map(String::as_str).collect(),
+        None => Vec::new(),
+    }
+}
+
 async fn handle_tenant(
    tenant_match: &ArgMatches,
    env: &mut local_env::LocalEnv,
@@ -390,54 +417,6 @@ async fn handle_tenant(
                println!("{} {:?}", t.id, t.state);
            }
        }
-        Some(("import", import_match)) => {
-            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
-
-            let storage_controller = StorageController::from_env(env);
-            let create_response = storage_controller.tenant_import(tenant_id).await?;
-
-            let shard_zero = create_response
-                .shards
-                .first()
-                .expect("Import response omitted shards");
-
-            let attached_pageserver_id = shard_zero.node_id;
-            let pageserver =
-                PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
-
-            println!(
-                "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
-            );
-
-            let timelines = pageserver
-                .http_client
-                .list_timelines(shard_zero.shard_id)
-                .await?;
-
-            // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
-            let main_timeline = timelines
-                .iter()
-                .find(|t| t.ancestor_timeline_id.is_none())
-                .expect("No timelines found")
-                .timeline_id;
-
-            let mut branch_i = 0;
-            for timeline in timelines.iter() {
-                let branch_name = if timeline.timeline_id == main_timeline {
-                    "main".to_string()
-                } else {
-                    branch_i += 1;
-                    format!("branch_{branch_i}")
-                };
-
-                println!(
-                    "Importing timeline {tenant_id}/{} as branch {branch_name}",
-                    timeline.timeline_id
-                );
-
-                env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
-            }
-        }
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
@@ -810,8 +789,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .copied()
                .unwrap_or(false);

-            let allow_multiple = sub_args.get_flag("allow-multiple");
-
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -829,9 +806,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                _ => {}
            }

-            if !allow_multiple {
-                cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
-            }
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;

            cplane.new_endpoint(
                &endpoint_id,
@@ -860,8 +835,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

-            let allow_multiple = sub_args.get_flag("allow-multiple");
-
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -887,13 +860,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .cloned()
                .unwrap_or_default();

-            if !allow_multiple {
-                cplane.check_conflicting_endpoints(
-                    endpoint.mode,
-                    endpoint.tenant_id,
-                    endpoint.timeline_id,
-                )?;
-            }
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;

            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
@@ -1049,7 +1020,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1075,7 +1049,10 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }

-            if let Err(e) = pageserver.start().await {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1202,7 +1179,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically

    broker::start_broker_process(env).await?;
@@ -1219,7 +1196,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start().await {
+        if let Err(e) = pageserver
+            .start(&pageserver_config_overrides(sub_match))
+            .await
+        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1360,6 +1340,13 @@ fn cli() -> Command {
        .required(false)
        .value_name("stop-mode");

+    let pageserver_config_args = Arg::new("pageserver-config-override")
+        .long("pageserver-config-override")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
+        .required(false);
+
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
@@ -1393,7 +1380,9 @@ fn cli() -> Command {
    let num_pageservers_arg = Arg::new("num-pageservers")
        .value_parser(value_parser!(u16))
        .long("num-pageservers")
-        .help("How many pageservers to create (default 1)");
+        .help("How many pageservers to create (default 1)")
+        .required(false)
+        .default_value("1");

    let update_catalog = Arg::new("update-catalog")
        .value_parser(value_parser!(bool))
@@ -1407,25 +1396,20 @@ fn cli() -> Command {
        .help("If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`")
        .required(false);

-    let allow_multiple = Arg::new("allow-multiple")
-        .help("Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests.")
-        .long("allow-multiple")
-        .action(ArgAction::SetTrue)
-        .required(false);
-
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
        .subcommand(
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
+                .arg(pageserver_config_args.clone())
                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
                        .required(false)
                        .value_parser(value_parser!(PathBuf))
-                        .value_name("config")
+                        .value_name("config"),
                )
                .arg(pg_version_arg.clone())
                .arg(force_arg)
@@ -1496,8 +1480,6 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
-                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1507,6 +1489,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1514,14 +1497,15 @@ fn cli() -> Command {
                )
                .subcommand(Command::new("restart")
                    .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
                .arg_required_else_help(true)
                .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller"))
-                .subcommand(Command::new("stop").about("Stop storage controller")
+                .subcommand(Command::new("start").about("Start storage controller").arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("stop").about("Stop storage controller")
                            .arg(stop_mode_arg.clone()))
        )
        .subcommand(
@@ -1567,7 +1551,6 @@ fn cli() -> Command {
                    .arg(pg_version_arg.clone())
                    .arg(hot_standby_arg.clone())
                    .arg(update_catalog)
-                    .arg(allow_multiple.clone())
                )
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
@@ -1576,7 +1559,6 @@ fn cli() -> Command {
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                    .arg(create_test_user)
-                    .arg(allow_multiple.clone())
                )
                .subcommand(Command::new("reconfigure")
                            .about("Reconfigure the endpoint")
@@ -1628,6 +1610,7 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
+                .arg(pageserver_config_args)
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -554,7 +554,6 @@ impl Endpoint {
            format_version: 1.0,
            operation_uuid: None,
            features: self.features.clone(),
-            swap_size_bytes: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -3,7 +3,7 @@
 //! Now it also provides init method which acts like a stub for proper installation
 //! script which will use local paths.

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};

 use clap::ValueEnum;
 use postgres_backend::AuthType;
@@ -17,14 +17,11 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
 };

-use crate::pageserver::PageServerNode;
-use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
 use crate::safekeeper::SafekeeperNode;

 pub const DEFAULT_PG_VERSION: u32 = 15;
@@ -36,7 +33,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
 // to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
-#[derive(PartialEq, Eq, Clone, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
    // compute endpoints).
@@ -44,99 +41,55 @@ pub struct LocalEnv {
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
    // '.neon' if not given.
+    #[serde(skip)]
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
    // "lib", "share" from postgres distribution are there. If at some point
    // in time we will be able to run against vanilla postgres we may split that
    // to four separate paths and match OS-specific installation layout.
+    #[serde(default)]
    pub pg_distrib_dir: PathBuf,

    // Path to pageserver binary.
+    #[serde(default)]
    pub neon_distrib_dir: PathBuf,

    // Default tenant ID to use with the 'neon_local' command line utility, when
    // --tenant_id is not explicitly specified.
+    #[serde(default)]
    pub default_tenant_id: Option<TenantId>,

    // used to issue tokens during e.g pg start
+    #[serde(default)]
    pub private_key_path: PathBuf,

    pub broker: NeonBroker,

-    // Configuration for the storage controller (1 per neon_local environment)
-    pub storage_controller: NeonStorageControllerConf,
-
    /// This Vec must always contain at least one pageserver
-    /// Populdated by [`Self::load_config`] from the individual `pageserver.toml`s.
-    /// NB: not used anymore except for informing users that they need to change their `.neon/config`.
    pub pageservers: Vec<PageServerConf>,

+    #[serde(default)]
    pub safekeepers: Vec<SafekeeperConf>,

    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
+    #[serde(default)]
    pub control_plane_api: Option<Url>,

    // Control plane upcall API for storage controller.  If set, this will be propagated into the
    // storage controller's configuration.
+    #[serde(default)]
    pub control_plane_compute_hook_api: Option<Url>,

    /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user.
+    #[serde(default)]
    // A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
    // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
    // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
-    pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
-}
-
-/// On-disk state stored in `.neon/config`.
-#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
-#[serde(default, deny_unknown_fields)]
-pub struct OnDiskConfig {
-    pub pg_distrib_dir: PathBuf,
-    pub neon_distrib_dir: PathBuf,
-    pub default_tenant_id: Option<TenantId>,
-    pub private_key_path: PathBuf,
-    pub broker: NeonBroker,
-    pub storage_controller: NeonStorageControllerConf,
-    #[serde(
-        skip_serializing,
-        deserialize_with = "fail_if_pageservers_field_specified"
-    )]
-    pub pageservers: Vec<PageServerConf>,
-    pub safekeepers: Vec<SafekeeperConf>,
-    pub control_plane_api: Option<Url>,
-    pub control_plane_compute_hook_api: Option<Url>,
    branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
 }

-fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
-where
-    D: serde::Deserializer<'de>,
-{
-    Err(serde::de::Error::custom(
-        "The 'pageservers' field is no longer used; pageserver.toml is now authoritative; \
-         Please remove the `pageservers` from your .neon/config.",
-    ))
-}
-
-/// The description of the neon_local env to be initialized by `neon_local init --config`.
-#[derive(Clone, Debug, Deserialize)]
-#[serde(deny_unknown_fields)]
-pub struct NeonLocalInitConf {
-    // TODO: do we need this? Seems unused
-    pub pg_distrib_dir: Option<PathBuf>,
-    // TODO: do we need this? Seems unused
-    pub neon_distrib_dir: Option<PathBuf>,
-    pub default_tenant_id: TenantId,
-    pub broker: NeonBroker,
-    pub storage_controller: Option<NeonStorageControllerConf>,
-    pub pageservers: Vec<NeonLocalInitPageserverConf>,
-    pub safekeepers: Vec<SafekeeperConf>,
-    pub control_plane_api: Option<Option<Url>>,
-    pub control_plane_compute_hook_api: Option<Option<Url>>,
-}
-
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -145,29 +98,6 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }

-/// Broker config for cluster internal communication.
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
-pub struct NeonStorageControllerConf {
-    /// Heartbeat timeout before marking a node offline
-    #[serde(with = "humantime_serde")]
-    pub max_unavailable: Duration,
-}
-
-impl NeonStorageControllerConf {
-    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
-        std::time::Duration::from_secs(10);
-}
-
-impl Default for NeonStorageControllerConf {
-    fn default() -> Self {
-        Self {
-            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
-        }
-    }
-}
-
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
@@ -183,18 +113,22 @@ impl NeonBroker {
    }
 }

-// neon_local needs to know this subset of pageserver configuration.
-// For legacy reasons, this information is duplicated from `pageserver.toml` into `.neon/config`.
-// It can get stale if `pageserver.toml` is changed.
-// TODO(christian): don't store this at all in `.neon/config`, always load it from `pageserver.toml`
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default, deny_unknown_fields)]
 pub struct PageServerConf {
+    // node id
    pub id: NodeId,
+
+    // Pageserver connection settings
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
+
+    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
+
+    pub(crate) virtual_file_io_engine: Option<String>,
+    pub(crate) get_vectored_impl: Option<String>,
 }

 impl Default for PageServerConf {
@@ -205,40 +139,8 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
-        }
-    }
-}
-
-/// The toml that can be passed to `neon_local init --config`.
-/// This is a subset of the `pageserver.toml` configuration.
-// TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
-#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
-pub struct NeonLocalInitPageserverConf {
-    pub id: NodeId,
-    pub listen_pg_addr: String,
-    pub listen_http_addr: String,
-    pub pg_auth_type: AuthType,
-    pub http_auth_type: AuthType,
-    #[serde(flatten)]
-    pub other: HashMap<String, toml::Value>,
-}
-
-impl From<&NeonLocalInitPageserverConf> for PageServerConf {
-    fn from(conf: &NeonLocalInitPageserverConf) -> Self {
-        let NeonLocalInitPageserverConf {
-            id,
-            listen_pg_addr,
-            listen_http_addr,
-            pg_auth_type,
-            http_auth_type,
-            other: _,
-        } = conf;
-        Self {
-            id: *id,
-            listen_pg_addr: listen_pg_addr.clone(),
-            listen_http_addr: listen_http_addr.clone(),
-            pg_auth_type: *pg_auth_type,
-            http_auth_type: *http_auth_type,
+            virtual_file_io_engine: None,
+            get_vectored_impl: None,
        }
    }
 }
@@ -426,7 +328,41 @@ impl LocalEnv {
            .collect()
    }

-    ///  Construct `Self` from on-disk state.
+    /// Create a LocalEnv from a config file.
+    ///
+    /// Unlike 'load_config', this function fills in any defaults that are missing
+    /// from the config file.
+    pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
+        let mut env: LocalEnv = toml::from_str(toml)?;
+
+        // Find postgres binaries.
+        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
+        // Note that later in the code we assume that distrib dirs follow the same pattern
+        // for all postgres versions.
+        if env.pg_distrib_dir == Path::new("") {
+            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
+                env.pg_distrib_dir = postgres_bin.into();
+            } else {
+                let cwd = env::current_dir()?;
+                env.pg_distrib_dir = cwd.join("pg_install")
+            }
+        }
+
+        // Find neon binaries.
+        if env.neon_distrib_dir == Path::new("") {
+            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
+        }
+
+        if env.pageservers.is_empty() {
+            anyhow::bail!("Configuration must contain at least one pageserver");
+        }
+
+        env.base_data_dir = base_path();
+
+        Ok(env)
+    }
+
+    /// Locate and load config
    pub fn load_config() -> anyhow::Result<Self> {
        let repopath = base_path();

@@ -440,129 +376,38 @@ impl LocalEnv {
        // TODO: check that it looks like a neon repository

        // load and parse file
-        let config_file_contents = fs::read_to_string(repopath.join("config"))?;
-        let on_disk_config: OnDiskConfig = toml::from_str(config_file_contents.as_str())?;
-        let mut env = {
-            let OnDiskConfig {
-                pg_distrib_dir,
-                neon_distrib_dir,
-                default_tenant_id,
-                private_key_path,
-                broker,
-                storage_controller,
-                pageservers,
-                safekeepers,
-                control_plane_api,
-                control_plane_compute_hook_api,
-                branch_name_mappings,
-            } = on_disk_config;
-            LocalEnv {
-                base_data_dir: repopath.clone(),
-                pg_distrib_dir,
-                neon_distrib_dir,
-                default_tenant_id,
-                private_key_path,
-                broker,
-                storage_controller,
-                pageservers,
-                safekeepers,
-                control_plane_api,
-                control_plane_compute_hook_api,
-                branch_name_mappings,
-            }
-        };
+        let config = fs::read_to_string(repopath.join("config"))?;
+        let mut env: LocalEnv = toml::from_str(config.as_str())?;

-        // The source of truth for pageserver configuration is the pageserver.toml.
-        assert!(
-            env.pageservers.is_empty(),
-            "we ensure this during deserialization"
-        );
-        env.pageservers = {
-            let iter = std::fs::read_dir(&repopath).context("open dir")?;
-            let mut pageservers = Vec::new();
-            for res in iter {
-                let dentry = res?;
-                const PREFIX: &str = "pageserver_";
-                let dentry_name = dentry
-                    .file_name()
-                    .into_string()
-                    .ok()
-                    .with_context(|| format!("non-utf8 dentry: {:?}", dentry.path()))
-                    .unwrap();
-                if !dentry_name.starts_with(PREFIX) {
-                    continue;
-                }
-                if !dentry.file_type().context("determine file type")?.is_dir() {
-                    anyhow::bail!("expected a directory, got {:?}", dentry.path());
-                }
-                let id = dentry_name[PREFIX.len()..]
-                    .parse::<NodeId>()
-                    .with_context(|| format!("parse id from {:?}", dentry.path()))?;
-                // TODO(christian): use pageserver_api::config::ConfigToml (PR #7656)
-                #[derive(serde::Serialize, serde::Deserialize)]
-                // (allow unknown fields, unlike PageServerConf)
-                struct PageserverConfigTomlSubset {
-                    id: NodeId,
-                    listen_pg_addr: String,
-                    listen_http_addr: String,
-                    pg_auth_type: AuthType,
-                    http_auth_type: AuthType,
-                }
-                let config_toml_path = dentry.path().join("pageserver.toml");
-                let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
-                    &std::fs::read_to_string(&config_toml_path)
-                        .with_context(|| format!("read {:?}", config_toml_path))?,
-                )
-                .context("parse pageserver.toml")?;
-                let PageserverConfigTomlSubset {
-                    id: config_toml_id,
-                    listen_pg_addr,
-                    listen_http_addr,
-                    pg_auth_type,
-                    http_auth_type,
-                } = config_toml;
-                let conf = PageServerConf {
-                    id: {
-                        anyhow::ensure!(
-                            config_toml_id == id,
-                            "id mismatch: config_toml.id={config_toml_id} id={id}",
-                        );
-                        id
-                    },
-                    listen_pg_addr,
-                    listen_http_addr,
-                    pg_auth_type,
-                    http_auth_type,
-                };
-                pageservers.push(conf);
-            }
-            pageservers
-        };
+        env.base_data_dir = repopath;

        Ok(env)
    }

-    pub fn persist_config(&self) -> anyhow::Result<()> {
-        Self::persist_config_impl(
-            &self.base_data_dir,
-            &OnDiskConfig {
-                pg_distrib_dir: self.pg_distrib_dir.clone(),
-                neon_distrib_dir: self.neon_distrib_dir.clone(),
-                default_tenant_id: self.default_tenant_id,
-                private_key_path: self.private_key_path.clone(),
-                broker: self.broker.clone(),
-                storage_controller: self.storage_controller.clone(),
-                pageservers: vec![], // it's skip_serializing anyway
-                safekeepers: self.safekeepers.clone(),
-                control_plane_api: self.control_plane_api.clone(),
-                control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
-                branch_name_mappings: self.branch_name_mappings.clone(),
-            },
-        )
-    }
+    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
+        // Currently, the user first passes a config file with 'neon_local init --config=<path>'
+        // We read that in, in `parse_config`, and fill any missing defaults. Then it's saved
+        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
+        // a bit sad.
+        let mut conf_content = r#"# This file describes a local deployment of the page server
+# and safekeeeper node. It is read by the 'neon_local' command-line
+# utility.
+"#
+        .to_string();
+
+        // Convert the LocalEnv to a toml file.
+        //
+        // This could be as simple as this:
+        //
+        // conf_content += &toml::to_string_pretty(env)?;
+        //
+        // But it results in a "values must be emitted before tables". I'm not sure
+        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
+        // Maybe Rust reorders the fields to avoid padding or something?
+        // In any case, converting to toml::Value first, and serializing that, works.
+        // See https://github.com/alexcrichton/toml-rs/issues/142
+        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;

-    pub fn persist_config_impl(base_path: &Path, config: &OnDiskConfig) -> anyhow::Result<()> {
-        let conf_content = &toml::to_string_pretty(config)?;
        let target_config_path = base_path.join("config");
        fs::write(&target_config_path, conf_content).with_context(|| {
            format!(
@@ -587,13 +432,17 @@ impl LocalEnv {
        }
    }

-    /// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
-    pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
-        let base_path = base_path();
-        assert_ne!(base_path, Path::new(""));
-        let base_path = &base_path;
+    //
+    // Initialize a new Neon repository
+    //
+    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
+        // check if config already exists
+        let base_path = &self.base_data_dir;
+        ensure!(
+            base_path != Path::new(""),
+            "repository base path is missing"
+        );

-        // create base_path dir
        if base_path.exists() {
            match force {
                InitForceMode::MustNotExist => {
@@ -625,96 +474,70 @@ impl LocalEnv {
                }
            }
        }
+
+        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
+            bail!(
+                "Can't find postgres binary at {}",
+                self.pg_bin_dir(pg_version)?.display()
+            );
+        }
+        for binary in ["pageserver", "safekeeper"] {
+            if !self.neon_distrib_dir.join(binary).exists() {
+                bail!(
+                    "Can't find binary '{binary}' in neon distrib dir '{}'",
+                    self.neon_distrib_dir.display()
+                );
+            }
+        }
+
        if !base_path.exists() {
            fs::create_dir(base_path)?;
        }

-        let NeonLocalInitConf {
-            pg_distrib_dir,
-            neon_distrib_dir,
-            default_tenant_id,
-            broker,
-            storage_controller,
-            pageservers,
-            safekeepers,
-            control_plane_api,
-            control_plane_compute_hook_api,
-        } = conf;
-
-        // Find postgres binaries.
-        // Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
-        // Note that later in the code we assume, that distrib dirs follow the same pattern
-        // for all postgres versions.
-        let pg_distrib_dir = pg_distrib_dir.unwrap_or_else(|| {
-            if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
-                postgres_bin.into()
-            } else {
-                let cwd = env::current_dir().unwrap();
-                cwd.join("pg_install")
-            }
-        });
-
-        // Find neon binaries.
-        let neon_distrib_dir = neon_distrib_dir
-            .unwrap_or_else(|| env::current_exe().unwrap().parent().unwrap().to_owned());
-
        // Generate keypair for JWT.
        //
        // The keypair is only needed if authentication is enabled in any of the
        // components. For convenience, we generate the keypair even if authentication
        // is not enabled, so that you can easily enable it after the initialization
-        // step.
-        generate_auth_keys(
-            base_path.join("auth_private_key.pem").as_path(),
-            base_path.join("auth_public_key.pem").as_path(),
-        )
-        .context("generate auth keys")?;
-        let private_key_path = PathBuf::from("auth_private_key.pem");
-
-        // create the runtime type because the remaining initialization code below needs
-        // a LocalEnv instance op operation
-        // TODO: refactor to avoid this, LocalEnv should only be constructed from on-disk state
-        let env = LocalEnv {
-            base_data_dir: base_path.clone(),
-            pg_distrib_dir,
-            neon_distrib_dir,
-            default_tenant_id: Some(default_tenant_id),
-            private_key_path,
-            broker,
-            storage_controller: storage_controller.unwrap_or_default(),
-            pageservers: pageservers.iter().map(Into::into).collect(),
-            safekeepers,
-            control_plane_api: control_plane_api.unwrap_or_default(),
-            control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
-            branch_name_mappings: Default::default(),
-        };
-
-        // create endpoints dir
-        fs::create_dir_all(env.endpoints_path())?;
-
-        // create safekeeper dirs
-        for safekeeper in &env.safekeepers {
-            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
+        // step. However, if the key generation fails, we treat it as non-fatal if
+        // authentication was not enabled.
+        if self.private_key_path == PathBuf::new() {
+            match generate_auth_keys(
+                base_path.join("auth_private_key.pem").as_path(),
+                base_path.join("auth_public_key.pem").as_path(),
+            ) {
+                Ok(()) => {
+                    self.private_key_path = PathBuf::from("auth_private_key.pem");
+                }
+                Err(e) => {
+                    if !self.auth_keys_needed() {
+                        eprintln!("Could not generate keypair for JWT authentication: {e}");
+                        eprintln!("Continuing anyway because authentication was not enabled");
+                        self.private_key_path = PathBuf::from("auth_private_key.pem");
+                    } else {
+                        return Err(e);
+                    }
+                }
+            }
        }

-        // initialize pageserver state
-        for (i, ps) in pageservers.into_iter().enumerate() {
-            let runtime_ps = &env.pageservers[i];
-            assert_eq!(&PageServerConf::from(&ps), runtime_ps);
-            fs::create_dir(env.pageserver_data_dir(ps.id))?;
-            PageServerNode::from_env(&env, runtime_ps)
-                .initialize(ps)
-                .context("pageserver init failed")?;
+        fs::create_dir_all(self.endpoints_path())?;
+
+        for safekeeper in &self.safekeepers {
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
        }

-        // setup remote remote location for default LocalFs remote storage
-        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+        self.persist_config(base_path)
+    }

-        env.persist_config()
+    fn auth_keys_needed(&self) -> bool {
+        self.pageservers.iter().any(|ps| {
+            ps.pg_auth_type == AuthType::NeonJWT || ps.http_auth_type == AuthType::NeonJWT
+        }) || self.safekeepers.iter().any(|sk| sk.auth_enabled)
    }
 }

-pub fn base_path() -> PathBuf {
+fn base_path() -> PathBuf {
    match std::env::var_os("NEON_REPO_DIR") {
        Some(val) => PathBuf::from(val),
        None => PathBuf::from(".neon"),
@@ -757,3 +580,31 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
    }
    Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn simple_conf_parsing() {
+        let simple_conf_toml = include_str!("../simple.conf");
+        let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
+        assert!(
+            simple_conf_parse_result.is_ok(),
+            "failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
+        );
+
+        let string_to_replace = "listen_addr = '127.0.0.1:50051'";
+        let spoiled_url_str = "listen_addr = '!@$XOXO%^&'";
+        let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
+        assert!(
+            spoiled_url_toml.contains(spoiled_url_str),
+            "Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
+        );
+        let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
+        assert!(
+            spoiled_url_parse_result.is_err(),
+            "expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
+        );
+    }
+}
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -4,21 +4,21 @@
 //!
 //!   .neon/
 //!
+use std::borrow::Cow;
 use std::collections::HashMap;

 use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::str::FromStr;
+use std::process::Command;
 use std::time::Duration;

 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
 use pageserver_api::models::{
-    self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
-    TimelineInfo,
+    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
@@ -30,7 +30,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::local_env::{NeonLocalInitPageserverConf, PageServerConf};
+use crate::local_env::PageServerConf;
 use crate::{background_process, local_env::LocalEnv};

 /// Directory within .neon which will be used by default for LocalFs remote storage.
@@ -74,23 +74,57 @@ impl PageServerNode {
        }
    }

-    fn pageserver_init_make_toml(
-        &self,
-        conf: NeonLocalInitPageserverConf,
-    ) -> anyhow::Result<toml_edit::Document> {
-        assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
-
-        // TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
-
+    /// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
+    ///
+    /// These all end up on the command line of the `pageserver` binary.
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

+        let PageServerConf {
+            id,
+            listen_pg_addr,
+            listen_http_addr,
+            pg_auth_type,
+            http_auth_type,
+            virtual_file_io_engine,
+            get_vectored_impl,
+        } = &self.conf;
+
+        let id = format!("id={}", id);
+
+        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
+        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
+
+        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
+        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
+            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
+        } else {
+            String::new()
+        };
+        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
+            format!("get_vectored_impl='{get_vectored_impl}'")
+        } else {
+            String::new()
+        };
+
        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

-        let mut overrides = vec![pg_distrib_dir_param, broker_endpoint_param];
+        let mut overrides = vec![
+            id,
+            pg_distrib_dir_param,
+            http_auth_type_param,
+            pg_auth_type_param,
+            listen_http_addr_param,
+            listen_pg_addr_param,
+            broker_endpoint_param,
+            virtual_file_io_engine,
+            get_vectored_impl,
+        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
            overrides.push(format!(
@@ -100,7 +134,7 @@ impl PageServerNode {

            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
-            if matches!(conf.http_auth_type, AuthType::NeonJWT) {
+            if matches!(http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -109,40 +143,31 @@ impl PageServerNode {
            }
        }

-        if !conf.other.contains_key("remote_storage") {
+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
            overrides.push(format!(
                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
            ));
        }

-        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
+        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
        }

        // Apply the user-provided overrides
-        overrides.push(
-            toml_edit::ser::to_string_pretty(&conf)
-                .expect("we deserialized this from toml earlier"),
-        );
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));

-        // Turn `overrides` into a toml document.
-        // TODO: above code is legacy code, it should be refactored to use toml_edit directly.
-        let mut config_toml = toml_edit::Document::new();
-        for fragment_str in overrides {
-            let fragment = toml_edit::Document::from_str(&fragment_str)
-                .expect("all fragments in `overrides` are valid toml documents, this function controls that");
-            for (key, item) in fragment.iter() {
-                config_toml.insert(key, item.clone());
-            }
-        }
-        Ok(config_toml)
+        overrides
    }

    /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
-        self.pageserver_init(conf)
+    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
+        self.pageserver_init(config_overrides)
            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
    }

@@ -158,11 +183,11 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self) -> anyhow::Result<()> {
-        self.start_node().await
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false).await
    }

-    fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
+    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
        let node_id = self.conf.id;
        println!(
@@ -173,20 +198,29 @@ impl PageServerNode {
        );
        io::stdout().flush()?;

-        let config = self
-            .pageserver_init_make_toml(conf)
-            .context("make pageserver toml")?;
-        let config_file_path = datadir.join("pageserver.toml");
-        let mut config_file = std::fs::OpenOptions::new()
-            .create_new(true)
-            .write(true)
-            .open(&config_file_path)
-            .with_context(|| format!("open pageserver toml for write: {config_file_path:?}"))?;
-        config_file
-            .write_all(config.to_string().as_bytes())
-            .context("write pageserver toml")?;
-        drop(config_file);
-        // TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
+        if !datadir.exists() {
+            std::fs::create_dir(&datadir)?;
+        }
+
+        let datadir_path_str = datadir.to_str().with_context(|| {
+            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
+        })?;
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        args.push(Cow::Borrowed("--init"));
+
+        let init_output = Command::new(self.env.pageserver_bin())
+            .args(args.iter().map(Cow::as_ref))
+            .envs(self.pageserver_env_variables()?)
+            .output()
+            .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
+
+        anyhow::ensure!(
+            init_output.status.success(),
+            "Pageserver init for node {} did not finish successfully, stdout: {}, stderr: {}",
+            node_id,
+            String::from_utf8_lossy(&init_output.stdout),
+            String::from_utf8_lossy(&init_output.stderr),
+        );

        // Write metadata file, used by pageserver on startup to register itself with
        // the storage controller
@@ -200,13 +234,12 @@ impl PageServerNode {
        // situation: the metadata is written by some other script.
        std::fs::write(
            metadata_path,
-            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
-                postgres_host: "localhost".to_string(),
-                postgres_port: self.pg_connection_config.port(),
-                http_host: "localhost".to_string(),
-                http_port,
-                other: HashMap::new(),
-            })
+            serde_json::to_vec(&serde_json::json!({
+                "host": "localhost",
+                "port": self.pg_connection_config.port(),
+                "http_host": "localhost",
+                "http_port": http_port,
+            }))
            .unwrap(),
        )
        .expect("Failed to write metadata file");
@@ -214,7 +247,11 @@ impl PageServerNode {
        Ok(())
    }

-    async fn start_node(&self) -> anyhow::Result<()> {
+    async fn start_node(
+        &self,
+        config_overrides: &[&str],
+        update_config: bool,
+    ) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -231,12 +268,15 @@ impl PageServerNode {
                self.conf.id, datadir,
            )
        })?;
-        let args = vec!["-D", datadir_path_str];
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        if update_config {
+            args.push(Cow::Borrowed("--update-config"));
+        }
        background_process::start_process(
            "pageserver",
            &datadir,
            &self.env.pageserver_bin(),
-            args,
+            args.iter().map(Cow::as_ref),
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
@@ -253,6 +293,22 @@ impl PageServerNode {
        Ok(())
    }

+    fn pageserver_basic_args<'a>(
+        &self,
+        config_overrides: &'a [&'a str],
+        datadir_path_str: &'a str,
+    ) -> Vec<Cow<'a, str>> {
+        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
+
+        let overrides = self.neon_local_overrides(config_overrides);
+        for config_override in overrides {
+            args.push(Cow::Borrowed("-c"));
+            args.push(Cow::Owned(config_override));
+        }
+
+        args
+    }
+
    fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
@@ -378,11 +434,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("parse `timeline_get_throttle` from json")?,
-            switch_aux_file_policy: settings
-                .remove("switch_aux_file_policy")
-                .map(|x| x.parse::<AuxFilePolicy>())
-                .transpose()
-                .context("Failed to parse 'switch_aux_file_policy'")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -501,11 +552,6 @@ impl PageServerNode {
                    .map(serde_json::from_str)
                    .transpose()
                    .context("parse `timeline_get_throttle` from json")?,
-                switch_aux_file_policy: settings
-                    .remove("switch_aux_file_policy")
-                    .map(|x| x.parse::<AuxFilePolicy>())
-                    .transpose()
-                    .context("Failed to parse 'switch_aux_file_policy'")?,
            }
        };

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,8 +1,6 @@
-use crate::{
-    background_process,
-    local_env::{LocalEnv, NeonStorageControllerConf},
-};
+use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
+use hyper::Method;
 use pageserver_api::{
    controller_api::{
        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
@@ -16,7 +14,6 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{fs, str::FromStr};
 use tokio::process::Command;
@@ -35,13 +32,15 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
-    config: NeonStorageControllerConf,
 }

 const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
-            config: env.storage_controller.clone(),
        }
    }

@@ -274,6 +272,8 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

+        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &humantime::Duration::from(self.config.max_unavailable).to_string(),
+            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -379,7 +379,7 @@ impl StorageController {
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
-        method: reqwest::Method,
+        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> anyhow::Result<RS>
@@ -472,16 +472,6 @@ impl StorageController {
            .await
    }

-    #[instrument(skip(self))]
-    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch::<(), TenantCreateResponse>(
-            Method::POST,
-            format!("debug/v1/tenant/{tenant_id}/import"),
-            None,
-        )
-        .await
-    }
-
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,6 +1,7 @@
 use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
+use hyper::{Method, StatusCode};
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
@@ -13,7 +14,7 @@ use pageserver_api::{
    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
-use reqwest::{Method, StatusCode, Url};
+use reqwest::Url;
 use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};

@@ -231,7 +232,7 @@ impl Client {
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
-        method: Method,
+        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -30,7 +30,7 @@ The storage controller uses a postgres database to persist a subset of its state
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.

-The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.

 The `diesel` crate is used for defining models & migrations.

--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 chrono.workspace = true
 serde.workspace = true
+serde_with.workspace = true
 serde_json.workspace = true
 regex.workspace = true

--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -33,23 +33,6 @@ pub struct ComputeSpec {
    #[serde(default)]
    pub features: Vec<ComputeFeature>,

-    /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
-    /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
-    /// received.
-    ///
-    /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
-    /// spec generation doesn't need to be aware of the actual compute it's running on, while
-    /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
-    /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
-    /// giving every VM much more swap than it should have (32GiB).
-    ///
-    /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
-    /// enabling the swap resizing behavior once rollout is complete.
-    ///
-    /// See neondatabase/cloud#12047 for more.
-    #[serde(default)]
-    pub swap_size_bytes: Option<u64>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
--- a/libs/consumption_metrics/Cargo.toml
+++ b/libs/consumption_metrics/Cargo.toml
@@ -5,8 +5,11 @@ edition = "2021"
 license = "Apache-2.0"

 [dependencies]
+anyhow.workspace = true
 chrono.workspace = true
 rand.workspace = true
 serde.workspace = true
+serde_with.workspace = true
+utils.workspace = true

 workspace_hack.workspace = true
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -256,16 +256,7 @@ fn update_rusage_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-
-    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
-    #[cfg(target_os = "macos")]
-    {
-        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
-    }
-    #[cfg(not(target_os = "macos"))]
-    {
-        MAXRSS_KB.set(rusage_stats.ru_maxrss);
-    }
+    MAXRSS_KB.set(rusage_stats.ru_maxrss);
 }

 fn get_rusage_stats() -> libc::rusage {
@@ -480,15 +471,6 @@ impl<A: CounterPairAssoc> CounterPairVec<A> {
        let id = self.vec.with_labels(labels);
        self.vec.remove_metric(id)
    }
-
-    pub fn sample(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) -> u64 {
-        let id = self.vec.with_labels(labels);
-        let metric = self.vec.get_metric(id);
-
-        let inc = metric.inc.count.load(std::sync::atomic::Ordering::Relaxed);
-        let dec = metric.dec.count.load(std::sync::atomic::Ordering::Relaxed);
-        inc.saturating_sub(dec)
-    }
 }

 impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -1,31 +0,0 @@
-use std::collections::HashMap;
-
-use const_format::formatcp;
-
-#[cfg(test)]
-mod tests;
-
-pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
-pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
-pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
-pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
-
-// Certain metadata (e.g. externally-addressable name, AZ) is delivered
-// as a separate structure.  This information is not neeed by the pageserver
-// itself, it is only used for registering the pageserver with the control
-// plane and/or storage controller.
-//
-#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
-pub struct NodeMetadata {
-    #[serde(rename = "host")]
-    pub postgres_host: String,
-    #[serde(rename = "port")]
-    pub postgres_port: u16,
-    pub http_host: String,
-    pub http_port: u16,
-
-    // Deployment tools may write fields to the metadata file beyond what we
-    // use in this type: this type intentionally only names fields that require.
-    #[serde(flatten)]
-    pub other: HashMap<String, serde_json::Value>,
-}
--- a/libs/pageserver_api/src/config/tests.rs
+++ b/libs/pageserver_api/src/config/tests.rs
@@ -1,22 +0,0 @@
-use super::*;
-
-#[test]
-fn test_node_metadata_v1_backward_compatibilty() {
-    let v1 = serde_json::to_vec(&serde_json::json!({
-        "host": "localhost",
-        "port": 23,
-        "http_host": "localhost",
-        "http_port": 42,
-    }));
-
-    assert_eq!(
-        serde_json::from_slice::<NodeMetadata>(&v1.unwrap()).unwrap(),
-        NodeMetadata {
-            postgres_host: "localhost".to_string(),
-            postgres_port: 23,
-            http_host: "localhost".to_string(),
-            http_port: 42,
-            other: HashMap::new(),
-        }
-    )
-}
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,6 +1,5 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
-use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
@@ -22,107 +21,15 @@ pub struct Key {
    pub field6: u32,
 }

-/// The storage key size.
 pub const KEY_SIZE: usize = 18;

-/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
-/// See [`Key::to_i128`] for more information on the encoding.
-pub const METADATA_KEY_SIZE: usize = 16;
-
-/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
-pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
-pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
-
-/// The (reserved) key prefix of relation sizes.
-pub const RELATION_SIZE_PREFIX: u8 = 0x61;
-
-/// The key prefix of AUX file keys.
-pub const AUX_KEY_PREFIX: u8 = 0x62;
-
-/// Check if the key falls in the range of metadata keys.
-pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
-    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
-}
-
 impl Key {
-    /// Check if the key falls in the range of metadata keys.
-    pub const fn is_metadata_key(&self) -> bool {
-        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
-    }
-
-    /// Encode a metadata key to a storage key.
-    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
-        assert!(is_metadata_key_slice(key), "key not in metadata key range");
-        Key {
-            field1: key[0],
-            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
-            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
-            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
-            field5: key[11],
-            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
-        }
-    }
-
-    /// Encode a metadata key to a storage key.
-    pub fn from_metadata_key(key: &[u8]) -> Self {
-        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
-    }
-
-    /// Extract a metadata key to a writer. The result should always be 16 bytes.
-    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
-        writer.put_u8(self.field1);
-        assert!(self.field2 <= 0xFFFF);
-        writer.put_u16(self.field2 as u16);
-        writer.put_u32(self.field3);
-        writer.put_u32(self.field4);
-        writer.put_u8(self.field5);
-        writer.put_u32(self.field6);
-    }
-
-    /// Get the range of metadata keys.
-    pub const fn metadata_key_range() -> Range<Self> {
-        Key {
-            field1: METADATA_KEY_BEGIN_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: METADATA_KEY_END_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
-    }
-
-    /// Get the range of aux keys.
-    pub fn metadata_aux_key_range() -> Range<Self> {
-        Key {
-            field1: AUX_KEY_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: AUX_KEY_PREFIX + 1,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
-    }
-
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0x7F) as i128) << 120)
+        (((self.field1 & 0xf) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
            | ((self.field4 as i128) << 40)
@@ -132,7 +39,7 @@ impl Key {

    pub const fn from_i128(x: i128) -> Self {
        Key {
-            field1: ((x >> 120) & 0x7F) as u8,
+            field1: ((x >> 120) & 0xf) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
            field3: (x >> 72) as u32,
            field4: (x >> 40) as u32,
@@ -141,11 +48,11 @@ impl Key {
        }
    }

-    pub const fn next(&self) -> Key {
+    pub fn next(&self) -> Key {
        self.add(1)
    }

-    pub const fn add(&self, x: u32) -> Key {
+    pub fn add(&self, x: u32) -> Key {
        let mut key = *self;

        let r = key.field6.overflowing_add(x);
@@ -174,8 +81,6 @@ impl Key {
        key
    }

-    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -187,8 +92,6 @@ impl Key {
        }
    }

-    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -572,17 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

-/// Non inherited range for vectored get.
-pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
-/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
-pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
-
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
+    key != AUX_FILES_KEY
 }

 #[inline(always)]
@@ -658,14 +556,11 @@ impl std::str::FromStr for Key {
 mod tests {
    use std::str::FromStr;

-    use crate::key::is_metadata_key_slice;
    use crate::key::Key;

    use rand::Rng;
    use rand::SeedableRng;

-    use super::AUX_KEY_PREFIX;
-
    #[test]
    fn display_fromstr_bijection() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -681,16 +576,4 @@ mod tests {

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }
-
-    #[test]
-    fn test_metadata_keys() {
-        let mut metadata_key = vec![AUX_KEY_PREFIX];
-        metadata_key.extend_from_slice(&[0xFF; 15]);
-        let encoded_key = Key::from_metadata_key(&metadata_key);
-        let mut output_key = Vec::new();
-        encoded_key.extract_metadata_key_to_writer(&mut output_key);
-        assert_eq!(metadata_key, output_key);
-        assert!(encoded_key.is_metadata_key());
-        assert!(is_metadata_key_slice(&metadata_key));
-    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,10 +1,7 @@
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;

-use crate::{
-    key::Key,
-    shard::{ShardCount, ShardIdentity},
-};
+use crate::key::Key;
 use itertools::Itertools;

 ///
@@ -17,279 +14,44 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

-/// A wrapper type for sparse keyspaces.
-#[derive(Clone, Debug, Default, PartialEq, Eq)]
-pub struct SparseKeySpace(pub KeySpace);
-
-/// Represents a contiguous half-open range of the keyspace, masked according to a particular
-/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
-/// shard.
-///
-/// When we iterate over keys within this object, we will skip any keys that don't belong
-/// to this shard.
-///
-/// The start + end keys may not belong to the shard: these specify where layer files should
-/// start  + end, but we will never actually read/write those keys.
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub struct ShardedRange<'a> {
-    pub shard_identity: &'a ShardIdentity,
-    pub range: Range<Key>,
-}
-
-// Calculate the size of a range within the blocks of the same relation, or spanning only the
-// top page in the previous relation's space.
-fn contiguous_range_len(range: &Range<Key>) -> u32 {
-    debug_assert!(is_contiguous_range(range));
-    if range.start.field6 == 0xffffffff {
-        range.end.field6 + 1
-    } else {
-        range.end.field6 - range.start.field6
-    }
-}
-
-/// Return true if this key range includes only keys in the same relation's data blocks, or
-/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
-///
-/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
-/// be on our shard.  Later in ShardedRange we do the extra work to figure out how much
-/// of a given contiguous range is present on one shard.
-///
-/// This matters, because:
-/// - Within such ranges, keys are used contiguously.  Outside such ranges it is sparse.
-/// - Within such ranges, we may calculate distances using simple subtraction of field6.
-fn is_contiguous_range(range: &Range<Key>) -> bool {
-    range.start.field1 == range.end.field1
-        && range.start.field2 == range.end.field2
-        && range.start.field3 == range.end.field3
-        && range.start.field4 == range.end.field4
-        && (range.start.field5 == range.end.field5
-            || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
-}
-
-impl<'a> ShardedRange<'a> {
-    pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
-        Self {
-            shard_identity,
-            range,
-        }
-    }
-
-    /// Break up this range into chunks, each of which has at least one local key in it if the
-    /// total range has at least one local key.
-    pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
-        // Optimization for single-key case (e.g. logical size keys)
-        if self.range.end == self.range.start.add(1) {
-            return vec![(
-                if self.shard_identity.is_key_disposable(&self.range.start) {
-                    0
-                } else {
-                    1
-                },
-                self.range,
-            )];
-        }
-
-        if !is_contiguous_range(&self.range) {
-            // Ranges that span relations are not fragmented.  We only get these ranges as a result
-            // of operations that act on existing layers, so we trust that the existing range is
-            // reasonably small.
-            return vec![(u32::MAX, self.range)];
-        }
-
-        let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
-
-        let mut cursor = self.range.start;
-        while cursor < self.range.end {
-            let advance_by = self.distance_to_next_boundary(cursor);
-            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
-
-            // If the previous fragment is undersized, then we seek to consume enough
-            // blocks to complete it.
-            let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
-                Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
-                Some(frag) => {
-                    // Prev block is complete, want the full number.
-                    (
-                        target_nblocks,
-                        if is_fragment_disposable {
-                            // If this current range will be empty (not shard-local data), we will merge into previous
-                            Some(frag)
-                        } else {
-                            None
-                        },
-                    )
-                }
-                None => {
-                    // First iteration, want the full number
-                    (target_nblocks, None)
-                }
-            };
-
-            let advance_by = if is_fragment_disposable {
-                advance_by
-            } else {
-                std::cmp::min(advance_by, want_blocks)
-            };
-
-            let next_cursor = cursor.add(advance_by);
-
-            let this_frag = (
-                if is_fragment_disposable {
-                    0
-                } else {
-                    advance_by
-                },
-                cursor..next_cursor,
-            );
-            cursor = next_cursor;
-
-            if let Some(last_fragment) = merge_last_fragment {
-                // Previous fragment was short or this one is empty, merge into it
-                last_fragment.0 += this_frag.0;
-                last_fragment.1.end = this_frag.1.end;
-            } else {
-                fragments.push(this_frag);
-            }
-        }
-
-        fragments
-    }
-
-    /// Estimate the physical pages that are within this range, on this shard.  This returns
-    /// u32::MAX if the range spans relations: this return value should be interpreted as "large".
-    pub fn page_count(&self) -> u32 {
-        // Special cases for single keys like logical sizes
-        if self.range.end == self.range.start.add(1) {
-            return if self.shard_identity.is_key_disposable(&self.range.start) {
-                0
-            } else {
-                1
-            };
-        }
-
-        // We can only do an authentic calculation of contiguous key ranges
-        if !is_contiguous_range(&self.range) {
-            return u32::MAX;
-        }
-
-        // Special case for single sharded tenants: our logical and physical sizes are the same
-        if self.shard_identity.count < ShardCount::new(2) {
-            return contiguous_range_len(&self.range);
-        }
-
-        // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
-        // to Self, and add the stripe's block count to our total if so.
-        let mut result: u64 = 0;
-        let mut cursor = self.range.start;
-        while cursor < self.range.end {
-            // Count up to the next stripe_size boundary or end of range
-            let advance_by = self.distance_to_next_boundary(cursor);
-
-            // If this blocks in this stripe belong to us, add them to our count
-            if !self.shard_identity.is_key_disposable(&cursor) {
-                result += advance_by as u64;
-            }
-
-            cursor = cursor.add(advance_by);
-        }
-
-        if result > u32::MAX as u64 {
-            u32::MAX
-        } else {
-            result as u32
-        }
-    }
-
-    /// Advance the cursor to the next potential fragment boundary: this is either
-    /// a stripe boundary, or the end of the range.
-    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
-        let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
-
-        if self.shard_identity.count < ShardCount::new(2) {
-            // Optimization: don't bother stepping through stripes if the tenant isn't sharded.
-            return distance_to_range_end;
-        }
-
-        if cursor.field6 == 0xffffffff {
-            // We are wrapping from one relation's logical size to the next relation's first data block
-            return 1;
-        }
-
-        let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
-        let stripe_remainder = self.shard_identity.stripe_size.0
-            - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
-
-        if cfg!(debug_assertions) {
-            // We should never overflow field5 and field6 -- our callers check this earlier
-            // and would have returned their u32::MAX cases if the input range violated this.
-            let next_cursor = cursor.add(stripe_remainder);
-            debug_assert!(
-                next_cursor.field1 == cursor.field1
-                    && next_cursor.field2 == cursor.field2
-                    && next_cursor.field3 == cursor.field3
-                    && next_cursor.field4 == cursor.field4
-                    && next_cursor.field5 == cursor.field5
-            )
-        }
-
-        std::cmp::min(stripe_remainder, distance_to_range_end)
-    }
-
-    /// Whereas `page_count` estimates the number of pages physically in this range on this shard,
-    /// this function simply calculates the number of pages in the space, without accounting for those
-    /// pages that would not actually be stored on this node.
-    ///
-    /// Don't use this function in code that works with physical entities like layer files.
-    pub fn raw_size(range: &Range<Key>) -> u32 {
-        if is_contiguous_range(range) {
-            contiguous_range_len(range)
-        } else {
-            u32::MAX
-        }
-    }
-}
-
 impl KeySpace {
-    /// Create a key space with a single range.
-    pub fn single(key_range: Range<Key>) -> Self {
-        Self {
-            ranges: vec![key_range],
-        }
-    }
-
+    ///
    /// Partition a key space into roughly chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
-    pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
+    pub fn partition(&self, target_size: u64) -> KeyPartitioning {
        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / BLCKSZ as u64) as u32;
+        let target_nblocks = (target_size / BLCKSZ as u64) as usize;

        let mut parts = Vec::new();
        let mut current_part = Vec::new();
        let mut current_part_size: usize = 0;
        for range in &self.ranges {
-            // While doing partitioning, wrap the range in ShardedRange so that our size calculations
-            // will respect shard striping rather than assuming all keys within a range are present.
-            let range = ShardedRange::new(range.clone(), shard_identity);
-
-            // Chunk up the range into parts that each contain up to target_size local blocks
-            for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
-                // If appending the next contiguous range in the keyspace to the current
-                // partition would cause it to be too large, and our current partition
-                // covers at least one block that is physically present in this shard,
-                // then start a new partition
-                if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
-                    && current_part_size > 0
-                {
-                    parts.push(KeySpace {
-                        ranges: current_part,
-                    });
-                    current_part = Vec::new();
-                    current_part_size = 0;
-                }
-                current_part.push(frag_range.start..frag_range.end);
-                current_part_size += frag_on_shard_size as usize;
+            // If appending the next contiguous range in the keyspace to the current
+            // partition would cause it to be too large, start a new partition.
+            let this_size = key_range_size(range) as usize;
+            if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
+                parts.push(KeySpace {
+                    ranges: current_part,
+                });
+                current_part = Vec::new();
+                current_part_size = 0;
            }
+
+            // If the next range is larger than 'target_size', split it into
+            // 'target_size' chunks.
+            let mut remain_size = this_size;
+            let mut start = range.start;
+            while remain_size > target_nblocks {
+                let next = start.add(target_nblocks as u32);
+                parts.push(KeySpace {
+                    ranges: vec![start..next],
+                });
+                start = next;
+                remain_size -= target_nblocks
+            }
+            current_part.push(start..range.end);
+            current_part_size += remain_size;
        }

        // add last partition that wasn't full yet.
@@ -302,10 +64,6 @@ impl KeySpace {
        KeyPartitioning { parts }
    }

-    pub fn is_empty(&self) -> bool {
-        self.total_raw_size() == 0
-    }
-
    /// Merge another keyspace into the current one.
    /// Note: the keyspaces must not ovelap (enforced via assertions)
    pub fn merge(&mut self, other: &KeySpace) {
@@ -336,13 +94,12 @@ impl KeySpace {

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    /// Returns the removed keyspace
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return KeySpace::default();
+                return;
            }
        };

@@ -355,37 +112,30 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

-        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
-                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
-                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
-                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
-                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
-
-        removed_accum.to_keyspace()
    }

    pub fn start(&self) -> Option<Key> {
@@ -396,11 +146,11 @@ impl KeySpace {
        self.ranges.last().map(|range| range.end)
    }

-    /// The size of the keyspace in pages, before accounting for sharding
-    pub fn total_raw_size(&self) -> usize {
+    #[allow(unused)]
+    pub fn total_size(&self) -> usize {
        self.ranges
            .iter()
-            .map(|range| ShardedRange::raw_size(range) as usize)
+            .map(|range| key_range_size(range) as usize)
            .sum()
    }

@@ -420,11 +170,6 @@ impl KeySpace {
    pub fn overlaps(&self, range: &Range<Key>) -> bool {
        self.overlaps_at(range).is_some()
    }
-
-    /// Check if the keyspace contains a key
-    pub fn contains(&self, key: &Key) -> bool {
-        self.overlaps(&(*key..key.next()))
-    }
 }

 ///
@@ -439,33 +184,10 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }

-/// Represents a partitioning of the sparse key space.
-#[derive(Clone, Debug, Default)]
-pub struct SparseKeyPartitioning {
-    pub parts: Vec<SparseKeySpace>,
-}
-
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
-
-    /// Convert a key partitioning to a sparse partition.
-    pub fn into_sparse(self) -> SparseKeyPartitioning {
-        SparseKeyPartitioning {
-            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
-        }
-    }
-}
-
-impl SparseKeyPartitioning {
-    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
-    /// cause long/dead loops.
-    pub fn into_dense(self) -> KeyPartitioning {
-        KeyPartitioning {
-            parts: self.parts.into_iter().map(|x| x.0).collect(),
-        }
-    }
 }

 ///
@@ -497,7 +219,7 @@ impl KeySpaceAccum {

    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
-        self.size += ShardedRange::raw_size(&range) as u64;
+        self.size += key_range_size(&range) as u64;

        match self.accum.as_mut() {
            Some(accum) => {
@@ -529,9 +251,7 @@ impl KeySpaceAccum {
        std::mem::take(self).to_keyspace()
    }

-    // The total number of keys in this object, ignoring any sharding effects that might cause some of
-    // the keys to be omitted in storage on this shard.
-    pub fn raw_size(&self) -> u64 {
+    pub fn size(&self) -> u64 {
        self.size
    }
 }
@@ -587,19 +307,36 @@ impl KeySpaceRandomAccum {
    }
 }

+#[inline(always)]
+pub fn key_range_size(key_range: &Range<Key>) -> u32 {
+    let start = key_range.start;
+    let end = key_range.end;
+
+    if end.field1 != start.field1
+        || end.field2 != start.field2
+        || end.field3 != start.field3
+        || end.field4 != start.field4
+    {
+        return u32::MAX;
+    }
+
+    let start = (start.field5 as u64) << 32 | start.field6 as u64;
+    let end = (end.field5 as u64) << 32 | end.field6 as u64;
+
+    let diff = end - start;
+    if diff > u32::MAX as u64 {
+        u32::MAX
+    } else {
+        diff as u32
+    }
+}
+
 pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
 }

 #[cfg(test)]
 mod tests {
-    use rand::{RngCore, SeedableRng};
-
-    use crate::{
-        models::ShardParameters,
-        shard::{ShardCount, ShardNumber},
-    };
-
    use super::*;
    use std::fmt::Write;

@@ -642,17 +379,14 @@ mod tests {
            accum.add_range(range.clone());
        }

-        let expected_size: u64 = ranges
-            .iter()
-            .map(|r| ShardedRange::raw_size(r) as u64)
-            .sum();
-        assert_eq!(accum.raw_size(), expected_size);
+        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        assert_eq!(accum.size(), expected_size);

        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
-        assert_eq!(accum.raw_size(), 0);
+        assert_eq!(accum.size(), 0);

        assert_ks_eq(&accum.consume_keyspace(), vec![]);
-        assert_eq!(accum.raw_size(), 0);
+        assert_eq!(accum.size(), 0);

        for range in &ranges {
            accum.add_range(range.clone());
@@ -819,16 +553,7 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(2)..Key::from_i128(3),
-                Key::from_i128(6)..Key::from_i128(7),
-                Key::from_i128(11)..Key::from_i128(12),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -858,17 +583,7 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(3)..Key::from_i128(5),
-                Key::from_i128(8)..Key::from_i128(10),
-                Key::from_i128(14)..Key::from_i128(15),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -895,11 +610,7 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace::default();
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -926,17 +637,7 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(9)..Key::from_i128(10),
-                Key::from_i128(12)..Key::from_i128(15),
-                Key::from_i128(17)..Key::from_i128(19),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -949,412 +650,4 @@ mod tests {
            ]
        );
    }
-    #[test]
-    fn sharded_range_relation_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
-                end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
-            },
-            &shard_identity,
-        );
-
-        // Key range spans relations, expect MAX
-        assert_eq!(range.page_count(), u32::MAX);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_single_key() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
-                end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
-            },
-            &shard_identity,
-        );
-        // Single-key range on logical size key
-        assert_eq!(range.page_count(), 1);
-    }
-
-    /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
-    #[test]
-    fn contiguous_range_check() {
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
-        ),);
-
-        // The ranges goes all the way up to the 0xffffffff, including it: this is
-        // not considered a rel block range because 0xffffffff stores logical sizes,
-        // not blocks.
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
-        ),);
-
-        // Keys within the normal data region of a relation
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
-        ),);
-
-        // The logical size key of one forkno, then some blocks in the next
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
-        ),);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_forkno_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
-                end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
-            },
-            &shard_identity,
-        );
-
-        // Range spanning the end of one forkno and the start of the next: we do not attempt to
-        // calculate a valid size, because we have no way to know if they keys between start
-        // and end are actually in use.
-        assert_eq!(range.page_count(), u32::MAX);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_one_relation() {
-        for shard_number in 0..4 {
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                ShardCount::new(4),
-                ShardParameters::DEFAULT_STRIPE_SIZE,
-            )
-            .unwrap();
-
-            let range = ShardedRange::new(
-                Range {
-                    start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
-                    end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
-                },
-                &shard_identity,
-            );
-
-            // Very simple case: range covering block zero of one relation, where that block maps to shard zero
-            if shard_number == 0 {
-                assert_eq!(range.page_count(), 1);
-            } else {
-                // Other shards should perceive the range's size as zero
-                assert_eq!(range.page_count(), 0);
-            }
-        }
-    }
-
-    /// Test helper: construct a ShardedRange and call fragment() on it, returning
-    /// the total page count in the range and the fragments.
-    fn do_fragment(
-        range_start: Key,
-        range_end: Key,
-        shard_identity: &ShardIdentity,
-        target_nblocks: u32,
-    ) -> (u32, Vec<(u32, Range<Key>)>) {
-        let range = ShardedRange::new(
-            Range {
-                start: range_start,
-                end: range_end,
-            },
-            shard_identity,
-        );
-
-        let page_count = range.page_count();
-        let fragments = range.fragment(target_nblocks);
-
-        // Invariant: we always get at least one fragment
-        assert!(!fragments.is_empty());
-
-        // Invariant: the first/last fragment start/end should equal the input start/end
-        assert_eq!(fragments.first().unwrap().1.start, range_start);
-        assert_eq!(fragments.last().unwrap().1.end, range_end);
-
-        if page_count > 0 {
-            // Invariant: every fragment must contain at least one shard-local page, if the
-            // total range contains at least one shard-local page
-            let all_nonzero = fragments.iter().all(|f| f.0 > 0);
-            if !all_nonzero {
-                eprintln!("Found a zero-length fragment: {:?}", fragments);
-            }
-            assert!(all_nonzero);
-        } else {
-            // A range with no shard-local pages should always be returned as a single fragment
-            assert_eq!(fragments, vec![(0, range_start..range_end)]);
-        }
-
-        // Invariant: fragments must be ordered and non-overlapping
-        let mut last: Option<Range<Key>> = None;
-        for frag in &fragments {
-            if let Some(last) = last {
-                assert!(frag.1.start >= last.end);
-                assert!(frag.1.start > last.start);
-            }
-            last = Some(frag.1.clone())
-        }
-
-        // Invariant: fragments respect target_nblocks
-        for frag in &fragments {
-            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
-        }
-
-        (page_count, fragments)
-    }
-
-    /// Really simple tests for fragment(), on a range that just contains a single stripe
-    /// for a single tenant.
-    #[test]
-    fn sharded_range_fragment_simple() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        // A range which we happen to know covers exactly one stripe which belongs to this shard
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
-
-        // Ask for stripe_size blocks, we get the whole stripe
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 32768),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for more, we still get the whole stripe
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 10000000),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for target_nblocks of half the stripe size, we get two halves
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16384),
-            (
-                32768,
-                vec![
-                    (16384, input_start..input_start.add(16384)),
-                    (16384, input_start.add(16384)..input_end)
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_multi_stripe() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
-        // Ask for all the blocks, get a fragment that covers the whole range but reports
-        // its size to be just the blocks belonging to our shard.
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 131072),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for a sub-stripe quantity
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16000),
-            (
-                32768,
-                vec![
-                    (16000, input_start..input_start.add(16000)),
-                    (16000, input_start.add(16000)..input_start.add(32000)),
-                    (768, input_start.add(32000)..input_end),
-                ]
-            )
-        );
-
-        // Try on a range that starts slightly after our owned stripe
-        assert_eq!(
-            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
-            (32767, vec![(32767, input_start.add(1)..input_end)])
-        );
-    }
-
-    /// Test our calculations work correctly when we start a range from the logical size key of
-    /// a previous relation.
-    #[test]
-    fn sharded_range_fragment_starting_from_logical_size() {
-        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
-
-        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
-        );
-
-        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
-        // store all logical sizes)
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
-        );
-    }
-
-    /// Test that ShardedRange behaves properly when used on un-sharded data
-    #[test]
-    fn sharded_range_fragment_unsharded() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (
-                0x10000,
-                vec![
-                    (0x8000, input_start..input_start.add(0x8000)),
-                    (0x8000, input_start.add(0x8000)..input_start.add(0x10000))
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_cross_relation() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
-        );
-
-        // Same, but using a sharded identity
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_tiny_nblocks() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-        let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
-        let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16),
-            (
-                0x38,
-                vec![
-                    (16, input_start..input_start.add(16)),
-                    (16, input_start.add(16)..input_start.add(32)),
-                    (16, input_start.add(32)..input_start.add(48)),
-                    (8, input_start.add(48)..input_end),
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_fuzz() {
-        // Use a fixed seed: we don't want to explicitly pick values, but we do want
-        // the test to be reproducible.
-        let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
-
-        for _i in 0..1000 {
-            let shard_identity = if prng.next_u32() % 2 == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                let shard_count = prng.next_u32() % 127 + 1;
-                ShardIdentity::new(
-                    ShardNumber((prng.next_u32() % shard_count) as u8),
-                    ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
-                )
-                .unwrap()
-            };
-
-            let target_nblocks = prng.next_u32() % 65536 + 1;
-
-            let start_offset = prng.next_u32() % 16384;
-
-            // Try ranges up to 4GiB in size, that are always at least 1
-            let range_size = prng.next_u32() % 8192 + 1;
-
-            // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-            let input_start = Key::from_hex("000000067F00000001000004E10000000000")
-                .unwrap()
-                .add(start_offset);
-            let input_end = input_start.add(range_size);
-
-            // This test's main success conditions are the invariants baked into do_fragment
-            let (_total_size, fragments) =
-                do_fragment(input_start, input_end, &shard_identity, target_nblocks);
-
-            // Pick a random key within the range and check it appears in the output
-            let example_key = input_start.add(prng.next_u32() % range_size);
-
-            // Panic on unwrap if it isn't found
-            let example_key_frag = fragments
-                .iter()
-                .find(|f| f.1.contains(&example_key))
-                .unwrap();
-
-            // Check that the fragment containing our random key has a nonzero size if
-            // that key is shard-local
-            let example_key_local = !shard_identity.is_key_disposable(&example_key);
-            if example_key_local {
-                assert!(example_key_frag.0 > 0);
-            }
-        }
-    }
 }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,5 +1,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
+use const_format::formatcp;

 pub mod controller_api;
 pub mod key;
@@ -10,4 +11,7 @@ pub mod shard;
 /// Public API types
 pub mod upcall_api;

-pub mod config;
+pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
+pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
+pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,4 +1,3 @@
-pub mod detach_ancestor;
 pub mod partitioning;
 pub mod utilization;

@@ -9,7 +8,6 @@ use std::{
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    str::FromStr,
    time::{Duration, SystemTime},
 };

@@ -63,7 +61,7 @@ use bytes::{Buf, BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::VariantNames,
+    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
@@ -305,31 +303,6 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
-    pub switch_aux_file_policy: Option<AuxFilePolicy>,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub enum AuxFilePolicy {
-    V1,
-    V2,
-    CrossValidation,
-}
-
-impl FromStr for AuxFilePolicy {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.to_lowercase();
-        if s == "v1" {
-            Ok(Self::V1)
-        } else if s == "v2" {
-            Ok(Self::V2)
-        } else if s == "crossvalidation" || s == "cross_validation" {
-            Ok(Self::CrossValidation)
-        } else {
-            anyhow::bail!("cannot parse {} to aux file policy", s)
-        }
-    }
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -456,6 +429,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -806,17 +780,6 @@ pub struct SecondaryProgress {
    pub bytes_total: u64,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantScanRemoteStorageShard {
-    pub tenant_shard_id: TenantShardId,
-    pub generation: Option<u32>,
-}
-
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub struct TenantScanRemoteStorageResponse {
-    pub shards: Vec<TenantScanRemoteStorageShard>,
-}
-
 pub mod virtual_file {
    #[derive(
        Copy,
@@ -884,72 +847,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
    }
 }

-// In the V2 protocol version, a GetPage request contains two LSN values:
-//
-// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
-// "get the latest version present". It's used by the primary server, which knows that no one else
-// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
-// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
-//
-// not_modified_since: Hint to the pageserver that the client knows that the page has not been
-// modified between 'not_modified_since' and the request LSN. It's always correct to set
-// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
-// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
-// request without waiting for 'request_lsn' to arrive.
-//
-// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
-// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
-// 'latest' was set to true. The V2 interface was added because there was no correct way for a
-// standby to request a page at a particular non-latest LSN, and also include the
-// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
-// request, if the standby knows that the page hasn't been modified since, and risk getting an error
-// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
-// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
-// difference in the responses between V1 and V2.
-//
-// The Request structs below reflect the V2 interface. If V1 is used, the parse function
-// maps the old format requests to the new format.
-//
-#[derive(Clone, Copy)]
-pub enum PagestreamProtocolVersion {
-    V1,
-    V2,
-}
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
    pub blkno: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub dbnode: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetSlruSegmentRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub kind: u8,
    pub segno: u32,
 }
@@ -996,16 +926,14 @@ pub struct TenantHistorySize {
 }

 impl PagestreamFeMessage {
-    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 2.
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1014,8 +942,8 @@ impl PagestreamFeMessage {

            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1024,8 +952,8 @@ impl PagestreamFeMessage {

            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1035,15 +963,15 @@ impl PagestreamFeMessage {

            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }

            Self::GetSlruSegment(req) => {
                bytes.put_u8(4);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u8(req.kind);
                bytes.put_u32(req.segno);
            }
@@ -1052,40 +980,18 @@ impl PagestreamFeMessage {
        bytes.into()
    }

-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
+        // TODO these gets can fail
+
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;
-
-        let (request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V1 => {
-                // In the old protocol, each message starts with a boolean 'latest' flag,
-                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
-                // 'not_modified_since', used in the new protocol version.
-                let latest = body.read_u8()? != 0;
-                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
-                if latest {
-                    (Lsn::MAX, request_lsn) // get latest version
-                } else {
-                    (request_lsn, request_lsn) // get version at specified LSN
-                }
-            }
-        };
-
-        // The rest of the messages are the same between V1 and V2
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1094,8 +1000,8 @@ impl PagestreamFeMessage {
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1104,8 +1010,8 @@ impl PagestreamFeMessage {
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1115,14 +1021,14 @@ impl PagestreamFeMessage {
                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                dbnode: body.read_u32::<BigEndian>()?,
            })),
            4 => Ok(PagestreamFeMessage::GetSlruSegment(
                PagestreamGetSlruSegmentRequest {
-                    request_lsn,
-                    not_modified_since,
+                    latest: body.read_u8()? != 0,
+                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                    kind: body.read_u8()?,
                    segno: body.read_u32::<BigEndian>()?,
                },
@@ -1250,8 +1156,8 @@ mod tests {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1260,8 +1166,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(4),
+                latest: false,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1270,8 +1176,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1281,16 +1187,14 @@ mod tests {
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                dbnode: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
-                    .unwrap();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            assert!(msg == reconstructed);
        }
    }
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +0,0 @@
-use utils::id::TimelineId;
-
-#[derive(Default, serde::Serialize)]
-pub struct AncestorDetached {
-    pub reparented_timelines: Vec<TimelineId>,
-}
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,11 +1,9 @@
 use utils::lsn::Lsn;

-use crate::keyspace::SparseKeySpace;
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-    pub sparse_keys: crate::keyspace::SparseKeySpace,
+
    pub at_lsn: Lsn,
 }

@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
-        map.serialize_key("sparse_keys")?;
-        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
-            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
-            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -139,12 +133,6 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
-            "sparse_keys": [
-              [
-                "620000000000000000000000000000000000",
-                "620000000000000000000000000000000003"
-              ]
-            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,7 +5,6 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
-use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;

@@ -97,7 +96,7 @@ impl ShardCount {

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
+    /// as `TenantShardId::unsharded`.
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
@@ -116,9 +115,7 @@ impl ShardCount {
        self.0
    }

-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
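+    /// Returns true if the internal value is zero, i.e. the tenant is unsharded (1 shard, legacy `TenantShardId` format).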
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
@@ -453,7 +450,7 @@ impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants.  Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
-    pub const fn unsharded() -> Self {
+    pub fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
@@ -652,13 +649,7 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    //
-    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
-    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
-    // because they must be included in basebackups.
-    let is_initfork = key.field5 == INIT_FORKNUM;
-
-    !is_rel_block_key(key) || is_initfork
+    !is_rel_block_key(key)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bytes.workspace = true
+futures.workspace = true
 rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
@@ -22,4 +23,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,9 +118,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{
-    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
-};
+pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};

 pub use v14::bindings::{CheckPoint, ControlFileData};

--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -331,10 +331,7 @@ impl CheckPoint {
    /// Returns 'true' if the XID was updated.
    pub fn update_next_xid(&mut self, xid: u32) -> bool {
        // nextXid should be greater than any XID in WAL, so increment provided XID and check for wraparround.
-        let mut new_xid = std::cmp::max(
-            xid.wrapping_add(1),
-            pg_constants::FIRST_NORMAL_TRANSACTION_ID,
-        );
+        let mut new_xid = std::cmp::max(xid.wrapping_add(1), pg_constants::FIRST_NORMAL_TRANSACTION_ID);
        // To reduce number of metadata checkpoints, we forward align XID on XID_CHECKPOINT_INTERVAL.
        // XID_CHECKPOINT_INTERVAL should not be larger than BLCKSZ*CLOG_XACTS_PER_BYTE
        new_xid =
@@ -370,16 +367,8 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
    let seg_off = lsn.segment_offset(WAL_SEGMENT_SIZE);

    let first_page_only = seg_off < XLOG_BLCKSZ;
-    // If first records starts in the middle of the page, pretend in page header
-    // there is a fake record which ends where first real record starts. This
-    // makes pg_waldump etc happy.
-    let (shdr_rem_len, infoflags) = if first_page_only && seg_off > 0 {
-        assert!(seg_off >= XLOG_SIZE_OF_XLOG_LONG_PHD);
-        // xlp_rem_len doesn't include page header, hence the subtraction.
-        (
-            seg_off - XLOG_SIZE_OF_XLOG_LONG_PHD,
-            pg_constants::XLP_FIRST_IS_CONTRECORD,
-        )
+    let (shdr_rem_len, infoflags) = if first_page_only {
+        (seg_off, pg_constants::XLP_FIRST_IS_CONTRECORD)
    } else {
        (0, 0)
    };
@@ -408,22 +397,20 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte

    if !first_page_only {
        let block_offset = lsn.page_offset_in_segment(WAL_SEGMENT_SIZE) as usize;
-        // see comments above about XLP_FIRST_IS_CONTRECORD and xlp_rem_len.
-        let (xlp_rem_len, xlp_info) = if page_off > 0 {
-            assert!(page_off >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64);
-            (
-                (page_off - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64) as u32,
-                pg_constants::XLP_FIRST_IS_CONTRECORD,
-            )
-        } else {
-            (0, 0)
-        };
        let header = XLogPageHeaderData {
            xlp_magic: XLOG_PAGE_MAGIC as u16,
-            xlp_info,
+            xlp_info: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                pg_constants::XLP_FIRST_IS_CONTRECORD
+            } else {
+                0
+            },
            xlp_tli: PG_TLI,
            xlp_pageaddr: lsn.page_lsn().0,
-            xlp_rem_len,
+            xlp_rem_len: if page_off >= pg_constants::SIZE_OF_PAGE_HEADER as u64 {
+                page_off as u32
+            } else {
+                0u32
+            },
            ..Default::default() // Put 0 in padding fields.
        };
        let hdr_bytes = header.encode()?;
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,9 +4,7 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{
-    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
-};
+use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -264,21 +262,11 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }

-    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
+    // Some records may not be flushed yet, e.g. non-transactional logical messages.
    //
-    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
-    // returns the position just after the page header on the next page. That's where the next
-    // record will be inserted. But the page header hasn't actually been written to the WAL
-    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
-    // error. Because of that, if the insert location is just after a page header, back off to
-    // previous page boundary.
-    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
-    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
-        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
-    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
-        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
-    }
-    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
+    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
+    // because pg_current_wal_insert_lsn skips page headers.
+    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
    Ok(intermediate_lsns)
 }

@@ -332,49 +320,38 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        client.execute("CREATE table t(x int)", &[])?;

-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
-        // will use carefully-sized logical messages to advance WAL insert location such
-        // that there is just enough space on the page for the XLOG_SWITCH record.
-        loop {
-            // We start with measuring how much WAL it takes for one logical message,
-            // considering all alignments and headers.
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
+        // We will use logical message as the padding. We start with detecting how much WAL
+        // it takes for one logical message, considering all alignments and headers.
+        let base_wal_advance = {
            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            // A small non-empty message bigger than a few bytes is more likely than an empty
+            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            let after_lsn = client.pg_current_wal_insert_lsn()?;
-
-            // Did the record cross a page boundary? If it did, start over. Crossing a
-            // page boundary adds to the apparent size of the record because of the page
-            // header, which throws off the calculation.
-            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
-                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
-            {
-                continue;
-            }
-            // base_size is the size of a logical message without the payload
-            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
-
-            // Is there enough space on the page for another logical message and an
-            // XLOG_SWITCH? If not, start over.
-            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
-                continue;
-            }
-
-            // We will write another logical message, such that after the logical message
-            // record, there will be space for exactly one XLOG_SWITCH. How large should
-            // the logical message's payload be? An XLOG_SWITCH record has no data => its
-            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
-
-            client.execute(
-                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-                &[&(repeats as i32)],
-            )?;
-            break;
+            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+                + XLOG_SIZE_OF_XLOG_RECORD
+        };
+        let mut remaining_lsn =
+            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
+        if remaining_lsn < base_wal_advance {
+            remaining_lsn += XLOG_BLCKSZ;
        }
+        let repeats = 10 + remaining_lsn - base_wal_advance;
+        info!(
+            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
+            client.pg_current_wal_insert_lsn()?,
+            remaining_lsn,
+            base_wal_advance,
+            repeats
+        );
+        client.execute(
+            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+            &[&(repeats as i32)],
+        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -11,6 +11,7 @@ pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
 tokio.workspace = true
+tracing.workspace = true
 thiserror.workspace = true
 serde.workspace = true

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -38,7 +38,6 @@ azure_storage_blobs.workspace = true
 futures-util.workspace = true
 http-types.workspace = true
 itertools.workspace = true
-sync_wrapper = { workspace = true, features = ["futures"] }

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -3,7 +3,6 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
-use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
@@ -21,7 +20,6 @@ use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
-use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
@@ -130,12 +128,12 @@ impl AzureBlobStorage {
        let kind = RequestKind::Get;

        let _permit = self.permit(kind, cancel).await?;
-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-        let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());

        let mut etag = None;
        let mut last_modified = None;
        let mut metadata = HashMap::new();
+        // TODO give proper streaming response instead of buffering into RAM
+        // https://github.com/neondatabase/neon/issues/5563

        let download = async {
            let response = builder
@@ -154,46 +152,39 @@ impl AzureBlobStorage {
                Err(_elapsed) => Err(DownloadError::Timeout),
            });

-            let mut response = Box::pin(response);
+            let mut response = std::pin::pin!(response);

-            let Some(part) = response.next().await else {
+            let mut bufs = Vec::new();
+            while let Some(part) = response.next().await {
+                let part = part?;
+                if etag.is_none() {
+                    etag = Some(part.blob.properties.etag);
+                }
+                if last_modified.is_none() {
+                    last_modified = Some(part.blob.properties.last_modified.into());
+                }
+                if let Some(blob_meta) = part.blob.metadata {
+                    metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+                }
+                let data = part
+                    .data
+                    .collect()
+                    .await
+                    .map_err(|e| DownloadError::Other(e.into()))?;
+                bufs.push(data);
+            }
+
+            if bufs.is_empty() {
                return Err(DownloadError::Other(anyhow::anyhow!(
-                    "Azure GET response contained no response body"
+                    "Azure GET response contained no buffers"
                )));
-            };
-            let part = part?;
-            if etag.is_none() {
-                etag = Some(part.blob.properties.etag);
            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
-
            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
            let etag = etag.unwrap();
            let last_modified = last_modified.unwrap();

-            let tail_stream = response
-                .map(|part| match part {
-                    Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
-                    Err(e) => {
-                        Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
-                    }
-                })
-                .flatten();
-            let stream = part
-                .data
-                .map(|r| r.map_err(io::Error::other))
-                .chain(sync_wrapper::SyncStream::new(tail_stream));
-            //.chain(SyncStream::from_pin(Box::pin(tail_stream)));
-
-            let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
-
            Ok(Download {
-                download_stream: Box::pin(download_stream),
+                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                etag,
                last_modified,
                metadata: Some(StorageMetadata(metadata)),
@@ -202,10 +193,7 @@ impl AzureBlobStorage {

        tokio::select! {
            bufs = download => bufs,
-            cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
-                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
-                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
-            },
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -21,13 +21,11 @@ use std::{
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    pin::Pin,
-    str::FromStr,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::{bail, Context};
-use aws_sdk_s3::types::StorageClass;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
@@ -55,11 +53,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// Set this limit analogously to the S3 limit
+/// We set this a little bit low as we currently buffer the entire file into RAM
 ///
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -136,11 +134,6 @@ impl RemotePath {
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
-
-    pub fn add_trailing_slash(&self) -> Self {
-        // Unwrap safety inputs are guararnteed to be valid UTF-8
-        Self(format!("{}/", self.0).try_into().unwrap())
-    }
 }

 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -164,21 +157,47 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
-    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
-    ///
-    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
-    /// from the absolute root of the bucket.
-    ///
-    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
-    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
-    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
-    /// returned in `keys` ().
-    ///
-    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
-    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
-    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
+    /// Lists all top-level subdirectories for a given prefix.
+    /// Note: here we assume that if the prefix is passed, it was obtained via remote_object_id,
+    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS),
+    /// so this method doesn't need to.
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::WithDelimiter, None, cancel)
+            .await?
+            .prefixes;
+        Ok(result)
+    }
+    /// Lists all files in directory "recursively"
+    /// (not really recursively, because AWS has a flat namespace)
+    /// Note: This is subtly different from list_prefixes,
+    /// because it is for listing files instead of listing
+    /// names sharing common prefixes.
+    /// For example,
+    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
+    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
+    /// whereas,
+    /// list_prefixes("foo/bar/") = ["cat", "dog"]
+    /// See `test_real_s3.rs` for more details.
    ///
+    /// max_keys limits max number of keys returned; None means unlimited.
+    async fn list_files(
+        &self,
+        prefix: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
+            .await?
+            .keys;
+        Ok(result)
+    }
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -317,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    //
+    // max_keys limits max number of keys returned; None means unlimited.
+    pub async fn list_files(
+        &self,
+        folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
+        }
+    }
+
+    // Lists the common *prefixes* shared by files, if any.
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
+    pub async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
+            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
+        }
+    }
+
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -565,7 +619,6 @@ pub struct S3Config {
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
    pub concurrency_limit: NonZeroUsize,
    pub max_keys_per_list_response: Option<i32>,
-    pub upload_storage_class: Option<StorageClass>,
 }

 impl Debug for S3Config {
@@ -694,18 +747,6 @@ impl RemoteStorageConfig {
                    endpoint,
                    concurrency_limit,
                    max_keys_per_list_response,
-                    upload_storage_class: toml
-                        .get("upload_storage_class")
-                        .map(|prefix_in_bucket| -> anyhow::Result<_> {
-                            let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
-                            let storage_class = StorageClass::from_str(&s).expect("infallible");
-                            #[allow(deprecated)]
-                            if matches!(storage_class, StorageClass::Unknown(_)) {
-                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
-                            }
-                            Ok(storage_class)
-                        })
-                        .transpose()?,
                })
            }
            (_, _, _, Some(_), None) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,9 +5,11 @@
 //! volume is mounted to the local FS.

 use std::{
-    collections::HashSet,
+    borrow::Cow,
+    future::Future,
    io::ErrorKind,
    num::NonZeroU32,
+    pin::Pin,
    time::{Duration, SystemTime, UNIX_EPOCH},
 };

@@ -20,11 +22,11 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use utils::crashsafe::path_with_suffix_extension;
+use tracing::*;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {

    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        use std::{future::Future, pin::Pin};
-        fn get_all_files<'a, P>(
-            directory_path: P,
-        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-        where
-            P: AsRef<Utf8Path> + Send + Sync + 'a,
-        {
-            Box::pin(async move {
-                let directory_path = directory_path.as_ref();
-                if directory_path.exists() {
-                    if directory_path.is_dir() {
-                        let mut paths = Vec::new();
-                        let mut dir_contents = fs::read_dir(directory_path).await?;
-                        while let Some(dir_entry) = dir_contents.next_entry().await? {
-                            let file_type = dir_entry.file_type().await?;
-                            let entry_path =
-                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                                    anyhow::Error::msg(format!(
-                                        "non-Unicode path: {}",
-                                        pb.to_string_lossy()
-                                    ))
-                                })?;
-                            if file_type.is_symlink() {
-                                tracing::debug!("{entry_path:?} is a symlink, skipping")
-                            } else if file_type.is_dir() {
-                                paths.extend(get_all_files(&entry_path).await?.into_iter())
-                            } else {
-                                paths.push(entry_path);
-                            }
-                        }
-                        Ok(paths)
-                    } else {
-                        bail!("Path {directory_path:?} is not a directory")
-                    }
-                } else {
-                    Ok(Vec::new())
-                }
-            })
-        }
-
-        Ok(get_all_files(&self.storage_root)
+        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
            .map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-
-        // If there's no trailing slash, we have to start looking from one above: even if
-        // `initial_dir` is a directory, we should still list any prefixes in the parent
-        // that start with the same string.
-        if !full_path.to_string().ends_with('/') {
-            initial_dir.pop();
-        }
-
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
        let op = async {
            let mut result = Listing::default();

-            // Filter out directories: in S3 directories don't exist, only the keys within them do.
-            let keys = self
-                .list_recursive(prefix)
+            if let ListingMode::NoDelimiter = mode {
+                let keys = self
+                    .list_recursive(prefix)
+                    .await
+                    .map_err(DownloadError::Other)?;
+
+                result.keys = keys
+                    .into_iter()
+                    .filter(|k| {
+                        let path = k.with_base(&self.storage_root);
+                        !path.is_dir()
+                    })
+                    .collect();
+
+                if let Some(max_keys) = max_keys {
+                    result.keys.truncate(max_keys.get() as usize);
+                }
+
+                return Ok(result);
+            }
+
+            let path = match prefix {
+                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+                None => Cow::Borrowed(&self.storage_root),
+            };
+
+            let prefixes_to_filter = get_all_files(path.as_ref(), false)
                .await
                .map_err(DownloadError::Other)?;
-            let keys = keys
-                .into_iter()
-                .filter(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    !path.is_dir()
-                })
-                .collect();

-            if let ListingMode::NoDelimiter = mode {
-                result.keys = keys;
-            } else {
-                let mut prefixes = HashSet::new();
-                for key in keys {
-                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
-                    let relative_key = if let Some(prefix) = prefix {
-                        let mut prefix = prefix.clone();
-                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
-                        // end up with full file/dir names.
-                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
-                        let has_slash = prefix.0.to_string().ends_with('/');
-                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
-                            prefix
-                        } else {
-                            prefix.0.pop();
-                            prefix
-                        };
-
-                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
-                    } else {
-                        key
-                    };
-
-                    let relative_key = format!("{}", relative_key);
-                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                        let first_part = relative_key
-                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                            .next()
-                            .unwrap()
-                            .to_owned();
-                        prefixes.insert(first_part);
-                    } else {
-                        result
-                            .keys
-                            .push(RemotePath::from_string(&relative_key).unwrap());
-                    }
+            // filter out empty directories to mirror s3 behavior.
+            for prefix in prefixes_to_filter {
+                if prefix.is_dir()
+                    && is_directory_empty(&prefix)
+                        .await
+                        .map_err(DownloadError::Other)?
+                {
+                    continue;
+                }
+
+                let stripped = prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    );
+
+                if prefix.is_dir() {
+                    result.prefixes.push(stripped);
+                } else {
+                    result.keys.push(stripped);
                }
-                result.prefixes = prefixes
-                    .into_iter()
-                    .map(|s| RemotePath::from_string(&s).unwrap())
-                    .collect();
            }

-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }
            Ok(result)
        };

@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

+fn get_all_files<'a, P>( // collects the entries found under `directory_path`
+    directory_path: P, // directory to read; a path that does not exist yields an empty Vec
+    recursive: bool, // true: descend into subdirectories; false: return them as entries
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+where
+    P: AsRef<Utf8Path> + Send + Sync + 'a,
+{
+    Box::pin(async move { // boxed future: the body below awaits `get_all_files` again
+        let directory_path = directory_path.as_ref();
+        if directory_path.exists() {
+            if directory_path.is_dir() {
+                let mut paths = Vec::new();
+                let mut dir_contents = fs::read_dir(directory_path).await?;
+                while let Some(dir_entry) = dir_contents.next_entry().await? {
+                    let file_type = dir_entry.file_type().await?;
+                    let entry_path =
+                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                            anyhow::Error::msg(format!(
+                                "non-Unicode path: {}",
+                                pb.to_string_lossy()
+                            ))
+                        })?; // an entry whose path is not valid Unicode aborts the listing
+                    if file_type.is_symlink() { // symlinks are neither followed nor returned
+                        debug!("{entry_path:?} is a symlink, skipping")
+                    } else if file_type.is_dir() {
+                        if recursive {
+                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
+                        } else {
+                            paths.push(entry_path) // non-recursive: the directory itself is an entry
+                        }
+                    } else {
+                        paths.push(entry_path); // neither symlink nor directory: returned as-is
+                    }
+                }
+                Ok(paths)
+            } else {
+                bail!("Path {directory_path:?} is not a directory")
+            }
+        } else {
+            Ok(Vec::new()) // missing path: nothing to list, not an error
+        }
+    })
+}
+
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -930,18 +923,13 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
-        let child_sibling =
-            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;

        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(
-            listing.keys.into_iter().collect::<HashSet<_>>(),
-            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
-        );
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

        // Delimiter: should only go one deep
        let listing = storage
@@ -954,25 +942,7 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());

-        // Delimiter & prefix with a trailing slash
-        let listing = storage
-            .list(
-                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(
-            listing.keys,
-            [RemotePath::from_string("uncle").unwrap()].to_vec()
-        );
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("parent").unwrap()].to_vec()
-        );
-
-        // Delimiter and prefix without a trailing slash
+        // Delimiter & prefix
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -981,66 +951,12 @@ mod fs_tests {
                &cancel,
            )
            .await?;
-        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("grandparent").unwrap()].to_vec()
-        );
-
-        // Delimiter and prefix that's partway through a path component
-        let listing = storage
-            .list(
-                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(listing.keys, [].to_vec());
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("grandparent").unwrap()].to_vec()
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn list_part_component() -> anyhow::Result<()> {
-        // No delimiter: should recursively list everything
-        let (storage, cancel) = create_storage()?;
-
-        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
-        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
-        // a freeform prefix.
-        let _child_a =
-            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
-        let _child_b =
-            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
-
-        // Delimiter and prefix that's partway through a path component
-        let listing = storage
-            .list(
-                Some(
-                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
-                ),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(listing.keys, [].to_vec());
-
-        let mut found_prefixes = listing.prefixes.clone();
-        found_prefixes.sort();
-        assert_eq!(
-            found_prefixes,
-            [
-                RemotePath::from_string("tenant").unwrap(),
-                RemotePath::from_string("tenant-01").unwrap(),
-            ]
-            .to_vec()
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());

        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -27,10 +27,10 @@ use aws_config::{
 };
 use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
+    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
-    upload_storage_class: Option<StorageClass>,
    concurrency_limiter: ConcurrencyLimiter,
    // Per-request timeout. Accessible for tests.
    pub timeout: Duration,
@@ -75,13 +74,13 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
+    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
-            remote_storage_config.bucket_name
+            aws_config.bucket_name
        );

-        let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
+        let region = Some(Region::new(aws_config.bucket_region.clone()));

        let provider_conf = ProviderConfig::without_region().with_region(region.clone());

@@ -113,38 +112,6 @@ impl S3Bucket {
        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());

-        let sdk_config_loader: aws_config::ConfigLoader = aws_config::defaults(
-            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
-            BehaviorVersion::v2023_11_09(),
-        )
-        .region(region)
-        .identity_cache(IdentityCache::lazy().build())
-        .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
-        .sleep_impl(SharedAsyncSleep::from(sleep_impl));
-
-        let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
-            s.spawn(|| {
-                // TODO: make this function async.
-                tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()
-                    .unwrap()
-                    .block_on(sdk_config_loader.load())
-            })
-            .join()
-            .unwrap()
-        });
-
-        let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config);
-
-        // Technically, the `remote_storage_config.endpoint` field only applies to S3 interactions.
-        // (In case we ever re-use the `sdk_config` for more than just the S3 client in the future)
-        if let Some(custom_endpoint) = remote_storage_config.endpoint.clone() {
-            s3_config_builder = s3_config_builder
-                .endpoint_url(custom_endpoint)
-                .force_path_style(true);
-        }
-
        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
@@ -152,36 +119,41 @@ impl S3Bucket {
        retry_config
            .set_max_attempts(Some(1))
            .set_mode(Some(RetryMode::Adaptive));
-        s3_config_builder = s3_config_builder.retry_config(retry_config.build());

-        let s3_config = s3_config_builder.build();
-        let client = aws_sdk_s3::Client::from_conf(s3_config);
+        let mut config_builder = Builder::default()
+            .behavior_version(BehaviorVersion::v2023_11_09())
+            .region(region)
+            .identity_cache(IdentityCache::lazy().build())
+            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+            .retry_config(retry_config.build())
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl));

-        let prefix_in_bucket = remote_storage_config
-            .prefix_in_bucket
-            .as_deref()
-            .map(|prefix| {
-                let mut prefix = prefix;
-                while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    prefix = &prefix[1..]
-                }
+        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
+            config_builder = config_builder
+                .endpoint_url(custom_endpoint)
+                .force_path_style(true);
+        }

-                let mut prefix = prefix.to_string();
-                while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                    prefix.pop();
-                }
-                prefix
-            });
+        let client = Client::from_conf(config_builder.build());

+        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
+            let mut prefix = prefix;
+            while prefix.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                prefix = &prefix[1..]
+            }
+
+            let mut prefix = prefix.to_string();
+            while prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                prefix.pop();
+            }
+            prefix
+        });
        Ok(Self {
            client,
-            bucket_name: remote_storage_config.bucket_name.clone(),
-            max_keys_per_list_response: remote_storage_config.max_keys_per_list_response,
+            bucket_name: aws_config.bucket_name.clone(),
+            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: ConcurrencyLimiter::new(
-                remote_storage_config.concurrency_limit.get(),
-            ),
-            upload_storage_class: remote_storage_config.upload_storage_class.clone(),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
            timeout,
        })
    }
@@ -206,7 +178,10 @@ impl S3Bucket {

    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path.get_path().as_str();
+        let path_string = path
+            .get_path()
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -496,11 +471,16 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| {
-                self.prefix_in_bucket.clone().map(|mut s| {
-                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                    s
-                })
+            .or_else(|| self.prefix_in_bucket.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
            });

        let _permit = self.permit(kind, cancel).await?;
@@ -569,15 +549,11 @@ impl RemoteStorage for S3Bucket {
                }
            }

-            // S3 gives us prefixes like "foo/", we return them like "foo"
-            result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                Some(
-                    self.s3_object_to_relative_path(
-                        o.prefix()?
-                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
-                    ),
-                )
-            }));
+            result.prefixes.extend(
+                prefixes
+                    .iter()
+                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
+            );

            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -610,7 +586,6 @@ impl RemoteStorage for S3Bucket {
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_metadata(metadata.map(|m| m.0))
-            .set_storage_class(self.upload_storage_class.clone())
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send();
@@ -662,7 +637,6 @@ impl RemoteStorage for S3Bucket {
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
-            .set_storage_class(self.upload_storage_class.clone())
            .copy_source(copy_source)
            .send();

@@ -920,7 +894,6 @@ impl RemoteStorage for S3Bucket {
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
-                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();

@@ -1077,22 +1050,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path/"],
-            vec!["/", "/some/path", "/some/path/"],
+            vec!["", "some/path", "some/path"],
+            vec!["/", "/some/path", "/some/path"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
        ];

@@ -1104,7 +1077,6 @@ mod tests {
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
-                upload_storage_class: None,
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
+    async fn list_prefixes( // wrapper: consults `attempt`, then forwards to `self.inner`
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) // presumably may inject a failure — confirm in `attempt`
+            .map_err(DownloadError::Other)?; // an Err from `attempt` returns early as DownloadError::Other
+        self.inner.list_prefixes(prefix, cancel).await // otherwise delegate unchanged
+    }
+
+    async fn list_files( // wrapper: consults `attempt`, then forwards to `self.inner`
+        &self,
+        folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned())) // NOTE(review): same RemoteOp variant as list_prefixes — confirm intended
+            .map_err(DownloadError::Other)?; // an Err from `attempt` returns early as DownloadError::Other
+        self.inner.list_files(folder, max_keys, cancel).await // otherwise delegate unchanged
+    }
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8Path;
-use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list(None, ListingMode::WithDelimiter, None, &cancel)
-        .await?
-        .prefixes
+        .list_prefixes(None, &cancel)
+        .await
+        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );

    let nested_remote_prefixes = test_client
-        .list(
-            Some(&base_prefix.add_trailing_slash()),
-            ListingMode::WithDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .prefixes
+        .list_prefixes(Some(&base_prefix), &cancel)
+        .await
+        .context("client list nested prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_no_delimiter_works(
-    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
+async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list(None, ListingMode::NoDelimiter, None, &cancel)
+        .list_files(None, None, &cancel)
        .await
        .context("client list root files failure")?
-        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list on root mismatches with the uploads."
+        "remote storage list_files on root mismatches with the uploads."
    );

    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list(
-            None,
-            ListingMode::NoDelimiter,
-            Some(NonZeroU32::new(2).unwrap()),
-            &cancel,
-        )
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.keys.len(), 2);
+    assert_eq!(limited_root_files.len(), 2);

    let nested_remote_files = test_client
-        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
+        .list_files(Some(&base_prefix), None, &cancel)
        .await
        .context("client list nested files failure")?
-        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list on subdirrectory mismatches with the uploads."
+        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(

    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

-    let prefixes = ctx
-        .client
-        .list(None, ListingMode::WithDelimiter, None, &cancel)
-        .await?
-        .prefixes;
+    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;

    assert_eq!(prefixes.len(), 1);

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -132,6 +132,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

+// NOTE: the setups for the list_prefixes test and the list_files test are very similar
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
-    RemoteStorageKind, S3Config,
+    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(
-            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
-                .await
-                .context("list root files failure")?
-                .keys
-                .into_iter()
-                .collect::<HashSet<_>>(),
-        )
+        Ok(retry(|| client.list_files(None, None, cancel))
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
    }

    let cancel = CancellationToken::new();
@@ -297,6 +294,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

+// NOTE: the setups for the list_prefixes test and the list_files test are very similar.
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details.
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
@@ -380,7 +381,6 @@ fn create_s3_client(
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
-            upload_storage_class: None,
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 serde.workspace = true
+serde_with.workspace = true
 const_format.workspace = true
 utils.workspace = true

--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -8,8 +8,6 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 serde.workspace = true
+serde_json.workspace = true

 workspace_hack.workspace = true
-
-[dev-dependencies]
-serde_json.workspace = true
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -6,10 +6,10 @@ license.workspace = true

 [dependencies]
 hyper.workspace = true
-opentelemetry.workspace = true
-opentelemetry_sdk = { workspace = true, features=["rt-tokio"] }
+opentelemetry = { workspace = true, features=["rt-tokio"] }
 opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -35,10 +35,10 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

+use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
 use opentelemetry_otlp::WithExportConfig;
 use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
-use opentelemetry_sdk::Resource;

 pub use tracing_opentelemetry::OpenTelemetryLayer;

@@ -71,7 +71,7 @@ pub mod http;
 ///
 /// This doesn't block, but is marked as 'async' to hint that this must be called in
 /// asynchronous execution context.
-pub async fn init_tracing(service_name: &str) -> Option<opentelemetry_sdk::trace::Tracer> {
+pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -82,7 +82,7 @@ pub async fn init_tracing(service_name: &str) -> Option<opentelemetry_sdk::trace
 /// tasks.
 pub fn init_tracing_without_runtime(
    service_name: &str,
-) -> Option<opentelemetry_sdk::trace::Tracer> {
+) -> Option<opentelemetry::sdk::trace::Tracer> {
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -113,9 +113,9 @@ pub fn init_tracing_without_runtime(
    Some(init_tracing_internal(service_name.to_string()))
 }

-fn init_tracing_internal(service_name: String) -> opentelemetry_sdk::trace::Tracer {
+fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
    // Set up exporter from the OTEL_EXPORTER_* environment variables
-    let mut exporter = opentelemetry_otlp::new_exporter().http();
+    let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();

    // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
    // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
@@ -147,19 +147,19 @@ fn init_tracing_internal(service_name: String) -> opentelemetry_sdk::trace::Trac

    // Propagate trace information in the standard W3C TraceContext format.
    opentelemetry::global::set_text_map_propagator(
-        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
+        opentelemetry::sdk::propagation::TraceContextPropagator::new(),
    );

    opentelemetry_otlp::new_pipeline()
        .tracing()
        .with_exporter(exporter)
        .with_trace_config(
-            opentelemetry_sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
+            opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
                opentelemetry_semantic_conventions::resource::SERVICE_NAME,
                service_name,
            )])),
        )
-        .install_batch(opentelemetry_sdk::runtime::Tokio)
+        .install_batch(opentelemetry::runtime::Tokio)
        .expect("could not initialize opentelemetry exporter")
 }

--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -27,6 +27,7 @@ hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
+leaky-bucket.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
@@ -43,6 +44,7 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
+serde_with.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -34,8 +34,6 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
-    pub const MAX: Self = Self::Valid(u32::MAX);
-
    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
    pub fn none() -> Self {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -384,7 +384,7 @@ impl FromStr for NodeId {

 #[cfg(test)]
 mod tests {
-    use serde_assert::{Deserializer, Serializer, Token};
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};

    use crate::bin_ser::BeSer;

@@ -395,7 +395,7 @@ mod tests {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
-        let expected_tokens = vec![
+        let expected_tokens = Tokens(vec![
            Token::Tuple { len: 16 },
            Token::U8(173),
            Token::U8(80),
@@ -414,14 +414,15 @@ mod tests {
            Token::U8(228),
            Token::U8(24),
            Token::TupleEnd,
-        ];
+        ]);

        let serializer = Serializer::builder().is_human_readable(false).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);

-        let mut deserializer = Deserializer::builder(serialized_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
+            .tokens(serialized_tokens)
            .build();
        let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
        assert_eq!(deserialized_id, original_id);
@@ -432,17 +433,20 @@ mod tests {
        let original_id = Id([
            173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
        ]);
-        let expected_tokens = vec![Token::Str(String::from("ad50847381e248feaac9876cc71ae418"))];
+        let expected_tokens = Tokens(vec![Token::Str(String::from(
+            "ad50847381e248feaac9876cc71ae418",
+        ))]);

        let serializer = Serializer::builder().is_human_readable(true).build();
        let serialized_tokens = original_id.serialize(&serializer).unwrap();
        assert_eq!(serialized_tokens, expected_tokens);

-        let mut deserializer = Deserializer::builder(vec![Token::Str(String::from(
-            "ad50847381e248feaac9876cc71ae418",
-        ))])
-        .is_human_readable(true)
-        .build();
+        let mut deserializer = Deserializer::builder()
+            .is_human_readable(true)
+            .tokens(Tokens(vec![Token::Str(String::from(
+                "ad50847381e248feaac9876cc71ae418",
+            ))]))
+            .build();
        assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
    }

--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -3,9 +3,9 @@ use std::str::FromStr;
 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
-use strum_macros::{EnumString, VariantNames};
+use strum_macros::{EnumString, EnumVariantNames};

-#[derive(EnumString, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
+#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
 #[strum(serialize_all = "snake_case")]
 pub enum LogFormat {
    Plain,
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -415,7 +415,7 @@ mod tests {

    use super::*;

-    use serde_assert::{Deserializer, Serializer, Token};
+    use serde_assert::{Deserializer, Serializer, Token, Tokens};

    #[test]
    fn test_lsn_strings() {
@@ -496,16 +496,18 @@ mod tests {
    #[test]
    fn test_lsn_serde() {
        let original_lsn = Lsn(0x0123456789abcdef);
-        let expected_readable_tokens = vec![Token::U64(0x0123456789abcdef)];
-        let expected_non_readable_tokens = vec![Token::Str(String::from("1234567/89ABCDEF"))];
+        let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
+        let expected_non_readable_tokens =
+            Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);

        // Testing human_readable ser/de
        let serializer = Serializer::builder().is_human_readable(false).build();
        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
        assert_eq!(readable_ser_tokens, expected_readable_tokens);

-        let mut deserializer = Deserializer::builder(readable_ser_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
            .build();
        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
        assert_eq!(des_lsn, original_lsn);
@@ -515,8 +517,9 @@ mod tests {
        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
        assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);

-        let mut deserializer = Deserializer::builder(non_readable_ser_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
            .build();
        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
        assert_eq!(des_lsn, original_lsn);
@@ -525,16 +528,18 @@ mod tests {
        let serializer = Serializer::builder().is_human_readable(false).build();
        let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();

-        let mut deserializer = Deserializer::builder(non_readable_ser_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(true)
+            .tokens(non_readable_ser_tokens)
            .build();
        Lsn::deserialize(&mut deserializer).unwrap_err();

        let serializer = Serializer::builder().is_human_readable(true).build();
        let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();

-        let mut deserializer = Deserializer::builder(readable_ser_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
+            .tokens(readable_ser_tokens)
            .build();
        Lsn::deserialize(&mut deserializer).unwrap_err();
    }
@@ -546,8 +551,9 @@ mod tests {
        let serializer = Serializer::builder().is_human_readable(false).build();
        let ser_tokens = original_lsn.serialize(&serializer).unwrap();

-        let mut deserializer = Deserializer::builder(ser_tokens)
+        let mut deserializer = Deserializer::builder()
            .is_human_readable(false)
+            .tokens(ser_tokens)
            .build();

        let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,10 +2,11 @@

 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
+use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{self, channel};
+use tokio::sync::watch::{channel, Receiver, Sender};
 use tokio::time::timeout;

 /// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }

-/// Heap of waiters, lowest numbers pop first.
-struct Waiters<V>
+/// Internal components of a `SeqWait`
+struct SeqWaitInt<S, V>
 where
+    S: MonotonicCounter<V>,
    V: Ord,
 {
-    heap: BinaryHeap<Waiter<V>>,
-    /// Number of the first waiter in the heap, or None if there are no waiters.
-    status_channel: watch::Sender<Option<V>>,
-}
-
-impl<V> Waiters<V>
-where
-    V: Ord + Copy,
-{
-    fn new() -> Self {
-        Waiters {
-            heap: BinaryHeap::new(),
-            status_channel: channel(None).0,
-        }
-    }
-
-    /// `status_channel` contains the number of the first waiter in the heap.
-    /// This function should be called whenever waiters heap changes.
-    fn update_status(&self) {
-        let first_waiter = self.heap.peek().map(|w| w.wake_num);
-        let _ = self.status_channel.send_replace(first_waiter);
-    }
-
-    /// Add new waiter to the heap, return a channel that will be notified when the number arrives.
-    fn add(&mut self, num: V) -> watch::Receiver<()> {
-        let (tx, rx) = channel(());
-        self.heap.push(Waiter {
-            wake_num: num,
-            wake_channel: tx,
-        });
-        self.update_status();
-        rx
-    }
-
-    /// Pop all waiters <= num from the heap. Collect channels in a vector,
-    /// so that caller can wake them up.
-    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
-        let mut wake_these = Vec::new();
-        while let Some(n) = self.heap.peek() {
-            if n.wake_num > num {
-                break;
-            }
-            wake_these.push(self.heap.pop().unwrap().wake_channel);
-        }
-        self.update_status();
-        wake_these
-    }
-
-    /// Used on shutdown to efficiently drop all waiters.
-    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
-        let heap = mem::take(&mut self.heap);
-        self.update_status();
-        heap
-    }
+    waiters: BinaryHeap<Waiter<V>>,
+    current: S,
+    shutdown: bool,
 }

 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,                     // wake me when this number arrives ...
-    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
+    wake_num: T,              // wake me when this number arrives ...
+    wake_channel: Sender<()>, // ... by sending a message to this channel
 }

 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {

 impl<T: Ord> Eq for Waiter<T> {}

-/// Internal components of a `SeqWait`
-struct SeqWaitInt<S, V>
-where
-    S: MonotonicCounter<V>,
-    V: Ord,
-{
-    waiters: Waiters<V>,
-    current: S,
-    shutdown: bool,
-}
-
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: Waiters::new(),
+            waiters: BinaryHeap::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -188,8 +128,9 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

-            // Take all waiters to drop them later.
-            internal.waiters.take_all()
+            // This will steal the entire waiters heap.
+            // When we drop it, all waiters will be woken.
+            mem::take(&mut internal.waiters)

            // Drop the lock as we exit this scope.
        };
@@ -255,7 +196,7 @@ where

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -264,8 +205,12 @@ where
            return Err(SeqWaitError::Shutdown);
        }

-        // Add waiter channel to the queue.
-        let rx = internal.waiters.add(num);
+        // Create a new channel.
+        let (tx, rx) = channel(());
+        internal.waiters.push(Waiter {
+            wake_num: num,
+            wake_channel: tx,
+        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -286,8 +231,16 @@ where
            }
            internal.current.cnt_advance(num);

-            // Pop all waiters <= num from the heap.
-            internal.waiters.pop_leq(num)
+            // Pop all waiters <= num from the heap. Collect them in a vector, and
+            // wake them up after releasing the lock.
+            let mut wake_these = Vec::new();
+            while let Some(n) = internal.waiters.peek() {
+                if n.wake_num > num {
+                    break;
+                }
+                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
+            }
+            wake_these
        };

        for tx in wake_these {
@@ -302,23 +255,6 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
-
-    /// Get a Receiver for the current status.
-    ///
-    /// The current status is the number of the first waiter in the queue,
-    /// or None if there are no waiters.
-    ///
-    /// This receiver will be notified whenever the status changes.
-    /// It is useful for receiving notifications when the first waiter
-    /// starts waiting for a number, or when there are no more waiters left.
-    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
-        self.internal
-            .lock()
-            .unwrap()
-            .waiters
-            .status_channel
-            .subscribe()
-    }
 }

 #[cfg(test)]
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -50,14 +50,6 @@ extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
    }
 }

-extern "C" fn update_donor(wp: *mut WalProposer, donor: *mut Safekeeper, donor_lsn: XLogRecPtr) {
-    unsafe {
-        let callback_data = (*(*wp).config).callback_data;
-        let api = callback_data as *mut Box<dyn ApiImpl>;
-        (*api).update_donor(&mut (*donor), donor_lsn)
-    }
-}
-
 extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
    unsafe {
        let callback_data = (*(*wp).config).callback_data;
@@ -399,7 +391,6 @@ pub(crate) fn create_api() -> walproposer_api {
        get_shmem_state: Some(get_shmem_state),
        start_streaming: Some(start_streaming),
        get_flush_rec_ptr: Some(get_flush_rec_ptr),
-        update_donor: Some(update_donor),
        get_current_timestamp: Some(get_current_timestamp),
        conn_error_message: Some(conn_error_message),
        conn_status: Some(conn_status),
@@ -430,32 +421,6 @@ pub(crate) fn create_api() -> walproposer_api {
    }
 }

-pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
-    let empty_feedback = crate::bindings::PageserverFeedback {
-        present: false,
-        currentClusterSize: 0,
-        last_received_lsn: 0,
-        disk_consistent_lsn: 0,
-        remote_consistent_lsn: 0,
-        replytime: 0,
-        shard_number: 0,
-    };
-
-    crate::bindings::WalproposerShmemState {
-        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
-        donor_name: [0; 64],
-        donor_conninfo: [0; 1024],
-        donor_lsn: 0,
-        mutex: 0,
-        mineLastElectedTerm: crate::bindings::pg_atomic_uint64 { value: 0 },
-        backpressureThrottlingTime: crate::bindings::pg_atomic_uint64 { value: 0 },
-        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
-        shard_ps_feedback: [empty_feedback; 128],
-        num_shards: 0,
-        min_ps_feedback: empty_feedback,
-    }
-}
-
 impl std::fmt::Display for Level {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(f, "{:?}", self)
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -1,5 +1,8 @@
 use std::ffi::CString;

+use postgres_ffi::WAL_SEGMENT_SIZE;
+use utils::{id::TenantTimelineId, lsn::Lsn};
+
 use crate::{
    api_bindings::{create_api, take_vec_u8, Level},
    bindings::{
@@ -7,8 +10,6 @@ use crate::{
        WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
    },
 };
-use postgres_ffi::WAL_SEGMENT_SIZE;
-use utils::{id::TenantTimelineId, lsn::Lsn};

 /// Rust high-level wrapper for C walproposer API. Many methods are not required
 /// for simple cases, hence todo!() in default implementations.
@@ -27,10 +28,6 @@ pub trait ApiImpl {
        todo!()
    }

-    fn update_donor(&self, _donor: &mut Safekeeper, _donor_lsn: u64) {
-        todo!()
-    }
-
    fn get_current_timestamp(&self) -> i64 {
        todo!()
    }
@@ -277,7 +274,6 @@ mod tests {
        sync::{atomic::AtomicUsize, mpsc::sync_channel},
    };

-    use std::cell::UnsafeCell;
    use utils::id::TenantTimelineId;

    use crate::{api_bindings::Level, bindings::NeonWALReadResult, walproposer::Wrapper};
@@ -301,8 +297,6 @@ mod tests {
        replies_ptr: AtomicUsize,
        // channel to send LSN to the main thread
        sync_channel: std::sync::mpsc::SyncSender<u64>,
-        // Shmem state, used for storing donor info
-        shmem: UnsafeCell<crate::bindings::WalproposerShmemState>,
    }

    impl MockImpl {
@@ -333,22 +327,11 @@ mod tests {
    }

    impl ApiImpl for MockImpl {
-        fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
-            self.shmem.get()
-        }
-
        fn get_current_timestamp(&self) -> i64 {
            println!("get_current_timestamp");
            0
        }

-        fn update_donor(&self, donor: &mut crate::bindings::Safekeeper, donor_lsn: u64) {
-            let mut shmem = unsafe { *self.get_shmem_state() };
-            shmem.propEpochStartLsn.value = donor_lsn;
-            shmem.donor_conninfo = donor.conninfo;
-            shmem.donor_lsn = donor_lsn;
-        }
-
        fn conn_status(
            &self,
            _: &mut crate::bindings::Safekeeper,
@@ -524,7 +507,6 @@ mod tests {
            ],
            replies_ptr: AtomicUsize::new(0),
            sync_channel: sender,
-            shmem: UnsafeCell::new(crate::api_bindings::empty_shmem()),
        });
        let config = crate::walproposer::Config {
            ttid,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -70,7 +70,6 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
-twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -1,7 +1,7 @@
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
-use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
@@ -28,7 +28,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap {
    let mut updates = layer_map.batch_update();
    for fname in filenames {
        let fname = fname.unwrap();
-        let fname = LayerName::from_str(&fname).unwrap();
+        let fname = LayerFileName::from_str(&fname).unwrap();
        let layer = PersistentLayerDesc::from(fname);

        let lsn_range = layer.get_lsn_range();
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -243,19 +243,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_scan_remote_storage(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantScanRemoteStorageResponse> {
-        let uri = format!(
-            "{}/v1/tenant/{tenant_id}/scan_remote_storage",
-            self.mgmt_api_endpoint
-        );
-        let response = self.request(Method::GET, &uri, ()).await?;
-        let body = response.json().await.map_err(Error::ReceiveBody)?;
-        Ok(body)
-    }
-
    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
        self.request(Method::PUT, &uri, req).await?;
@@ -284,34 +271,6 @@ impl Client {
        Ok((status, progress))
    }

-    pub async fn tenant_secondary_status(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<SecondaryProgress> {
-        let path = reqwest::Url::parse(&format!(
-            "{}/v1/tenant/{}/secondary/status",
-            self.mgmt_api_endpoint, tenant_shard_id
-        ))
-        .expect("Cannot build URL");
-
-        self.request(Method::GET, path, ())
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    pub async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
-        let path = reqwest::Url::parse(&format!(
-            "{}/v1/tenant/{}/heatmap_upload",
-            self.mgmt_api_endpoint, tenant_id
-        ))
-        .expect("Cannot build URL");
-
-        self.request(Method::POST, path, ()).await?;
-        Ok(())
-    }
-
    pub async fn location_config(
        &self,
        tenant_shard_id: TenantShardId,
@@ -319,7 +278,10 @@ impl Client {
        flush_ms: Option<std::time::Duration>,
        lazy: bool,
    ) -> Result<()> {
-        let req_body = TenantLocationConfigRequest { config };
+        let req_body = TenantLocationConfigRequest {
+            tenant_id: Some(tenant_shard_id),
+            config,
+        };

        let mut path = reqwest::Url::parse(&format!(
            "{}/v1/tenant/{}/location_config",
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -60,7 +60,7 @@ impl Client {
    ) -> anyhow::Result<PagestreamClient> {
        let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
            .client
-            .copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
+            .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
            .await?;
        let Client {
            cancel_on_client_drop,
--- a/pageserver/compaction/Cargo.toml
+++ b/pageserver/compaction/Cargo.toml
@@ -16,6 +16,7 @@ bytes.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
 const_format.workspace = true
+consumption_metrics.workspace = true
 crossbeam-utils.workspace = true
 either.workspace = true
 flate2.workspace = true
@@ -32,8 +33,10 @@ pin-project-lite.workspace = true
 rand.workspace = true
 smallvec = { workspace = true, features = ["write"] }
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-io-timeout.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
--- a/pageserver/compaction/src/compact_tiered.rs
+++ b/pageserver/compaction/src/compact_tiered.rs
@@ -18,7 +18,6 @@
 //! database size. For example, if the logical database size is 10 GB, we would
 //! generate new image layers every 10 GB of WAL.
 use futures::StreamExt;
-use pageserver_api::shard::ShardIdentity;
 use tracing::{debug, info};

 use std::collections::{HashSet, VecDeque};
@@ -126,7 +125,6 @@ async fn compact_level<E: CompactionJobExecutor>(
    }

    let mut state = LevelCompactionState {
-        shard_identity: *executor.get_shard_identity(),
        target_file_size,
        _lsn_range: lsn_range.clone(),
        layers: layer_fragments,
@@ -166,8 +164,6 @@ struct LevelCompactionState<'a, E>
 where
    E: CompactionJobExecutor,
 {
-    shard_identity: ShardIdentity,
-
    // parameters
    target_file_size: u64,

@@ -370,7 +366,6 @@ where
                .executor
                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                .await?,
-            &self.shard_identity,
        ) * 8192;

        let wal_size = job
@@ -435,7 +430,7 @@ where
            keyspace,
            self.target_file_size / 8192,
        );
-        while let Some(key_range) = window.choose_next_image(&self.shard_identity) {
+        while let Some(key_range) = window.choose_next_image() {
            new_jobs.push(CompactionJob::<E> {
                key_range,
                lsn_range: job.lsn_range.clone(),
@@ -628,12 +623,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
    }

    // Advance the cursor until it reaches 'target_keysize'.
-    fn advance_until_size(
-        &mut self,
-        w: &KeyspaceWindowHead<K>,
-        max_size: u64,
-        shard_identity: &ShardIdentity,
-    ) {
+    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
        while self.accum_keysize < max_size && !self.reached_end(w) {
            let curr_range = &w.keyspace[self.keyspace_idx];
            if self.end_key < curr_range.start {
@@ -642,7 +632,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            }

            // We're now within 'curr_range'. Can we advance past it completely?
-            let distance = K::key_range_size(&(self.end_key..curr_range.end), shard_identity);
+            let distance = K::key_range_size(&(self.end_key..curr_range.end));
            if (self.accum_keysize + distance as u64) < max_size {
                // oh yeah, it fits
                self.end_key = curr_range.end;
@@ -651,7 +641,7 @@ impl<K: CompactionKey> KeyspaceWindowPos<K> {
            } else {
                // advance within the range
                let skip_key = self.end_key.skip_some();
-                let distance = K::key_range_size(&(self.end_key..skip_key), shard_identity);
+                let distance = K::key_range_size(&(self.end_key..skip_key));
                if (self.accum_keysize + distance as u64) < max_size {
                    self.end_key = skip_key;
                    self.accum_keysize += distance as u64;
@@ -687,7 +677,7 @@ where
        }
    }

-    fn choose_next_image(&mut self, shard_identity: &ShardIdentity) -> Option<Range<K>> {
+    fn choose_next_image(&mut self) -> Option<Range<K>> {
        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
            // we've reached the end
            return None;
@@ -697,7 +687,6 @@ where
        next_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + self.head.target_keysize,
-            shard_identity,
        );

        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
@@ -706,7 +695,6 @@ where
        end_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
-            shard_identity,
        );
        if end_pos.reached_end(&self.head) {
            // gobble up any unused keyspace between the last used key and end of the range
--- a/pageserver/compaction/src/helpers.rs
+++ b/pageserver/compaction/src/helpers.rs
@@ -5,27 +5,19 @@ use crate::interface::*;
 use futures::future::BoxFuture;
 use futures::{Stream, StreamExt};
 use itertools::Itertools;
-use pageserver_api::shard::ShardIdentity;
 use pin_project_lite::pin_project;
 use std::collections::BinaryHeap;
 use std::collections::VecDeque;
-use std::fmt::Display;
 use std::future::Future;
 use std::ops::{DerefMut, Range};
 use std::pin::Pin;
 use std::task::{ready, Poll};

-pub fn keyspace_total_size<K>(
-    keyspace: &CompactionKeySpace<K>,
-    shard_identity: &ShardIdentity,
-) -> u64
+pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
 where
    K: CompactionKey,
 {
-    keyspace
-        .iter()
-        .map(|r| K::key_range_size(r, shard_identity) as u64)
-        .sum()
+    keyspace.iter().map(|r| K::key_range_size(r) as u64).sum()
 }

 pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
@@ -215,7 +207,7 @@ pub struct KeySize<K> {

 pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
 where
-    K: Eq + PartialOrd + Display + Copy,
+    K: Eq,
    I: Stream<Item = Result<D, E>>,
    D: CompactionDeltaEntry<'a, K>,
 {
@@ -230,15 +222,12 @@ where
                num_values: 1,
                size: first.size(),
            };
-            let mut last_key = accum.key;
            while let Some(this) = input.next().await {
                let this = this?;
                if this.key() == accum.key {
                    accum.size += this.size();
                    accum.num_values += 1;
                } else {
-                    assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
-                    last_key = accum.key;
                    yield accum;
                    accum = KeySize {
                        key: this.key(),
@@ -247,7 +236,6 @@ where
                    };
                }
            }
-            assert!(last_key <= accum.key, "last_key={last_key} <= accum.key={}", accum.key);
            yield accum;
        }
    }
--- a/pageserver/compaction/src/interface.rs
+++ b/pageserver/compaction/src/interface.rs
@@ -4,7 +4,7 @@
 //! All the heavy lifting is done by the create_image and create_delta
 //! functions that the implementor provides.
 use futures::Future;
-use pageserver_api::{key::Key, keyspace::ShardedRange, shard::ShardIdentity};
+use pageserver_api::{key::Key, keyspace::key_range_size};
 use std::ops::Range;
 use utils::lsn::Lsn;

@@ -32,8 +32,6 @@ pub trait CompactionJobExecutor {
    // Functions that the planner uses to support its decisions
    // ----

-    fn get_shard_identity(&self) -> &ShardIdentity;
-
    /// Return all layers that overlap the given bounding box.
    fn get_layers(
        &mut self,
@@ -100,7 +98,7 @@ pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
    ///
    /// This returns u32, for compatibility with Repository::key. If the
    /// distance is larger, return u32::MAX.
-    fn key_range_size(key_range: &Range<Self>, shard_identity: &ShardIdentity) -> u32;
+    fn key_range_size(key_range: &Range<Self>) -> u32;

    // return "self + 1"
    fn next(&self) -> Self;
@@ -115,8 +113,8 @@ impl CompactionKey for Key {
    const MIN: Self = Self::MIN;
    const MAX: Self = Self::MAX;

-    fn key_range_size(r: &std::ops::Range<Self>, shard_identity: &ShardIdentity) -> u32 {
-        ShardedRange::new(r.clone(), shard_identity).page_count()
+    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
+        key_range_size(r)
    }
    fn next(&self) -> Key {
        (self as &Key).next()
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -3,7 +3,6 @@ mod draw;
 use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};

 use futures::StreamExt;
-use pageserver_api::shard::ShardIdentity;
 use rand::Rng;
 use tracing::info;

@@ -72,7 +71,7 @@ impl interface::CompactionKey for Key {
    const MIN: Self = u64::MIN;
    const MAX: Self = u64::MAX;

-    fn key_range_size(key_range: &Range<Self>, _shard_identity: &ShardIdentity) -> u32 {
+    fn key_range_size(key_range: &Range<Self>) -> u32 {
        std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
    }

@@ -435,11 +434,6 @@ impl interface::CompactionJobExecutor for MockTimeline {
    type ImageLayer = Arc<MockImageLayer>;
    type RequestContext = MockRequestContext;

-    fn get_shard_identity(&self) -> &ShardIdentity {
-        static IDENTITY: ShardIdentity = ShardIdentity::unsharded();
-        &IDENTITY
-    }
-
    async fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,
--- a/pageserver/compaction/tests/tests.rs
+++ b/pageserver/compaction/tests/tests.rs
@@ -1,20 +1,5 @@
-use once_cell::sync::OnceCell;
 use pageserver_compaction::interface::CompactionLayer;
 use pageserver_compaction::simulator::MockTimeline;
-use utils::logging;
-
-static LOG_HANDLE: OnceCell<()> = OnceCell::new();
-
-pub(crate) fn setup_logging() {
-    LOG_HANDLE.get_or_init(|| {
-        logging::init(
-            logging::LogFormat::Test,
-            logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
-            logging::Output::Stdout,
-        )
-        .expect("Failed to init test logging")
-    });
-}

 /// Test the extreme case that there are so many updates for a single key that
 /// even if we produce an extremely narrow delta layer, spanning just that one
@@ -26,14 +11,13 @@ pub(crate) fn setup_logging() {
 #[ignore]
 #[tokio::test]
 async fn test_many_updates_for_single_key() {
-    setup_logging();
    let mut executor = MockTimeline::new();
-    executor.target_file_size = 1_000_000; // 1 MB
+    executor.target_file_size = 10_000_000; // 10 MB

-    // Ingest 10 MB of updates to a single key.
+    // Ingest 100 MB of updates to a single key.
    for _ in 1..1000 {
        executor.ingest_uniform(100, 10, &(0..100_000)).unwrap();
-        executor.ingest_uniform(1000, 10, &(0..1)).unwrap();
+        executor.ingest_uniform(10_000, 10, &(0..1)).unwrap();
        executor.compact().await.unwrap();
    }

@@ -49,26 +33,3 @@ async fn test_many_updates_for_single_key() {
        }
    }
 }
-
-#[tokio::test]
-async fn test_simple_updates() {
-    setup_logging();
-    let mut executor = MockTimeline::new();
-    executor.target_file_size = 500_000; // 500 KB
-
-    // Ingest some traffic.
-    for _ in 1..400 {
-        executor.ingest_uniform(100, 500, &(0..100_000)).unwrap();
-    }
-
-    for l in executor.live_layers.iter() {
-        println!("layer {}: {}", l.short_id(), l.file_size());
-    }
-
-    println!("Running compaction...");
-    executor.compact().await.unwrap();
-
-    for l in executor.live_layers.iter() {
-        println!("layer {}: {}", l.short_id(), l.file_size());
-    }
-}
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,45 +9,18 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! The plain text API was chosen so that we can easily work with filenames from various
-//! sources; see the Usage section below for examples.
-//!
-//! # Usage
-//!
-//! ## Producing the SVG
-//!
+//! Example use:
 //! ```bash
-//!
-//! # local timeline dir
-//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//!
-//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
-//! (jq -r '.historic_layers[] | .layer_file_name' | cargo  run -p pagectl draw-timeline) < layer-map.json > out.svg
-//!
-//! # From an `index_part.json` in S3
-//! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
-//!
+//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//! $ firefox out.svg
 //! ```
 //!
-//! ## Viewing
+//! This API was chosen so that we can easily work with filenames extracted from ssh,
+//! or from pageserver log files.
 //!
-//! **Inkscape** is better than the built-in viewers in browsers.
-//!
-//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
-//! to see the layer file name in the comment field.
-//!
-//! ```bash
-//!
-//! # Linux
-//! inkscape out.svg
-//!
-//! # macOS
-//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
-//!
-//! ```
-//!
-
+//! TODO Consider shipping this as a grafana panel plugin:
+//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -92,12 +65,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {

 pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
-    struct Layer {
-        filename: String,
-        key_range: Range<Key>,
-        lsn_range: Range<Lsn>,
-    }
-    let mut files: Vec<Layer> = vec![];
+    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        let line = line.unwrap();
@@ -108,23 +76,14 @@ pub fn main() -> Result<()> {
            // Don't try and parse "metadata" like a key-lsn range
            continue;
        }
-        let (key_range, lsn_range) = parse_filename(filename);
-        files.push(Layer {
-            filename: filename.to_owned(),
-            key_range,
-            lsn_range,
-        });
+        let range = parse_filename(filename);
+        ranges.push(range);
    }

    // Collect all coordinates
    let mut keys: Vec<Key> = vec![];
    let mut lsns: Vec<Lsn> = vec![];
-    for Layer {
-        key_range: keyr,
-        lsn_range: lsnr,
-        ..
-    } in &files
-    {
+    for (keyr, lsnr) in &ranges {
        keys.push(keyr.start);
        keys.push(keyr.end);
        lsns.push(lsnr.start);
@@ -148,12 +107,7 @@ pub fn main() -> Result<()> {
            h: stretch * lsn_map.len() as f32
        }
    );
-    for Layer {
-        filename,
-        key_range: keyr,
-        lsn_range: lsnr,
-    } in &files
-    {
+    for (keyr, lsnr) in &ranges {
        let key_start = *key_map.get(&keyr.start).unwrap();
        let key_end = *key_map.get(&keyr.end).unwrap();
        let key_diff = key_end - key_start;
@@ -197,7 +151,6 @@ pub fn main() -> Result<()> {
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
-            .comment(filename)
        );
    }
    println!("{}", EndSvg);
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -3,7 +3,7 @@ use std::collections::HashMap;
 use anyhow::Context;
 use camino::Utf8PathBuf;
 use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
-use pageserver::tenant::storage_layer::LayerName;
+use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
 use utils::lsn::Lsn;

@@ -19,7 +19,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
            #[derive(serde::Serialize)]
            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, IndexLayerMetadata>,
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
                disk_consistent_lsn: Lsn,
                timeline_metadata: &'a TimelineMetadata,
            }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -312,12 +312,8 @@ async fn main_impl(
                    let (rel_tag, block_no) =
                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                    PagestreamGetPageRequest {
-                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                            Lsn::MAX
-                        } else {
-                            r.timeline_lsn
-                        },
-                        not_modified_since: r.timeline_lsn,
+                        latest: rng.gen_bool(args.req_latest_probability),
+                        lsn: r.timeline_lsn,
                        rel: rel_tag,
                        blkno: block_no,
                    }
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -1,208 +0,0 @@
-use bytes::{Buf, BufMut, Bytes};
-use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
-use tracing::warn;
-
-/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
-fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
-    let mut key = [0; METADATA_KEY_SIZE];
-    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
-    key[0] = AUX_KEY_PREFIX;
-    key[1] = dir_level1;
-    key[2] = dir_level2;
-    key[3..16].copy_from_slice(&hash[0..13]);
-    Key::from_metadata_key_fixed_size(&key)
-}
-
-const AUX_DIR_PG_LOGICAL: u8 = 0x01;
-const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
-const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
-
-/// Encode the aux file into a fixed-size key.
-///
-/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
-/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path
-/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
-/// is roughly based on the first two components of the path, one unique number for one component.
-///
-/// * pg_logical/mappings -> 0x0101
-/// * pg_logical/snapshots -> 0x0102
-/// * pg_logical/replorigin_checkpoint -> 0x0103
-/// * pg_logical/others -> 0x01FF
-/// * pg_replslot/ -> 0x0201
-/// * others -> 0xFFFF
-///
-/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
-/// The new file type must have never been written to the storage before. Otherwise, there could be data
-/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
-pub fn encode_aux_file_key(path: &str) -> Key {
-    if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
-        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
-    } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
-        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
-    } else if path == "pg_logical/replorigin_checkpoint" {
-        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
-    } else if let Some(fname) = path.strip_prefix("pg_logical/") {
-        if cfg!(debug_assertions) {
-            warn!(
-                "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
-                path
-            );
-        }
-        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
-    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
-        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
-    } else {
-        if cfg!(debug_assertions) {
-            warn!(
-                "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
-                path
-            );
-        }
-        aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
-    }
-}
-
-const AUX_FILE_ENCODING_VERSION: u8 = 0x01;
-
-pub fn decode_file_value(val: &[u8]) -> anyhow::Result<Vec<(&str, &[u8])>> {
-    let mut ptr = val;
-    if ptr.is_empty() {
-        // empty value = no files
-        return Ok(Vec::new());
-    }
-    assert_eq!(
-        ptr.get_u8(),
-        AUX_FILE_ENCODING_VERSION,
-        "unsupported aux file value"
-    );
-    let mut files = vec![];
-    while ptr.has_remaining() {
-        let key_len = ptr.get_u32() as usize;
-        let key = &ptr[..key_len];
-        ptr.advance(key_len);
-        let val_len = ptr.get_u32() as usize;
-        let content = &ptr[..val_len];
-        ptr.advance(val_len);
-
-        let path = std::str::from_utf8(key)?;
-        files.push((path, content));
-    }
-    Ok(files)
-}
-
-/// Decode an aux file key-value pair into a list of files. The returned `Bytes` contains reference
-/// to the original value slice. Be cautious about memory consumption.
-pub fn decode_file_value_bytes(val: &Bytes) -> anyhow::Result<Vec<(String, Bytes)>> {
-    let mut ptr = val.clone();
-    if ptr.is_empty() {
-        // empty value = no files
-        return Ok(Vec::new());
-    }
-    assert_eq!(
-        ptr.get_u8(),
-        AUX_FILE_ENCODING_VERSION,
-        "unsupported aux file value"
-    );
-    let mut files = vec![];
-    while ptr.has_remaining() {
-        let key_len = ptr.get_u32() as usize;
-        let key = ptr.slice(..key_len);
-        ptr.advance(key_len);
-        let val_len = ptr.get_u32() as usize;
-        let content = ptr.slice(..val_len);
-        ptr.advance(val_len);
-
-        let path = std::str::from_utf8(&key)?.to_string();
-        files.push((path, content));
-    }
-    Ok(files)
-}
-
-pub fn encode_file_value(files: &[(&str, &[u8])]) -> anyhow::Result<Vec<u8>> {
-    if files.is_empty() {
-        // no files = empty value
-        return Ok(Vec::new());
-    }
-    let mut encoded = vec![];
-    encoded.put_u8(AUX_FILE_ENCODING_VERSION);
-    for (path, content) in files {
-        if path.len() > u32::MAX as usize {
-            anyhow::bail!("{} exceeds path size limit", path);
-        }
-        encoded.put_u32(path.len() as u32);
-        encoded.put_slice(path.as_bytes());
-        if content.len() > u32::MAX as usize {
-            anyhow::bail!("{} exceeds content size limit", path);
-        }
-        encoded.put_u32(content.len() as u32);
-        encoded.put_slice(content);
-    }
-    Ok(encoded)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_hash_portable() {
-        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
-        // if the algorithm produces the same hash across different environments.
-        assert_eq!(
-            305317690835051308206966631765527126151,
-            twox_hash::xxh3::hash128("test1".as_bytes())
-        );
-        assert_eq!(
-            85104974691013376326742244813280798847,
-            twox_hash::xxh3::hash128("test/test2".as_bytes())
-        );
-        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
-    }
-
-    #[test]
-    fn test_encoding_portable() {
-        // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
-        // of the page server.
-        assert_eq!(
-            "6200000101E5B20C5F8DD5AA3289D6D9EAFA",
-            encode_aux_file_key("pg_logical/mappings/test1").to_string()
-        );
-        assert_eq!(
-            "620000010239AAC544893139B26F501B97E6",
-            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
-        );
-        assert_eq!(
-            "620000010300000000000000000000000000",
-            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
-        );
-        assert_eq!(
-            "62000001FF8635AF2134B7266EC5B4189FD6",
-            encode_aux_file_key("pg_logical/unsupported").to_string()
-        );
-        assert_eq!(
-            "6200000201772D0E5D71DE14DA86142A1619",
-            encode_aux_file_key("pg_replslot/test3").to_string()
-        );
-        assert_eq!(
-            "620000FFFF1866EBEB53B807B26A2416F317",
-            encode_aux_file_key("other_file_not_supported").to_string()
-        );
-    }
-
-    #[test]
-    fn test_value_encoding() {
-        let files = vec![
-            ("pg_logical/1.file", "1111".as_bytes()),
-            ("pg_logical/2.file", "2222".as_bytes()),
-        ];
-        assert_eq!(
-            files,
-            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
-        );
-        let files = vec![];
-        assert_eq!(
-            files,
-            decode_file_value(&encode_file_value(&files).unwrap()).unwrap()
-        );
-    }
-}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,7 +10,7 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{anyhow, Context};
+use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
 use pageserver_api::key::{key_to_slru_block, Key};
@@ -38,14 +38,6 @@ use postgres_ffi::PG_TLI;
 use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

-#[derive(Debug, thiserror::Error)]
-pub enum BasebackupError {
-    #[error("basebackup pageserver error {0:#}")]
-    Server(#[from] anyhow::Error),
-    #[error("basebackup client error {0:#}")]
-    Client(#[source] io::Error),
-}
-
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -61,7 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    ctx: &'a RequestContext,
-) -> Result<(), BasebackupError>
+) -> anyhow::Result<()>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
@@ -100,10 +92,8 @@ where

    // Consolidate the derived and the provided prev_lsn values
    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
-        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
-            return Err(BasebackupError::Server(anyhow!(
-                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
-            )));
+        if backup_prev != Lsn(0) {
+            ensure!(backup_prev == provided_prev_lsn);
        }
        provided_prev_lsn
    } else {
@@ -169,26 +159,15 @@ where
        }
    }

-    async fn add_block(&mut self, key: &Key, block: Bytes) -> Result<(), BasebackupError> {
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
        let (kind, segno, _) = key_to_slru_block(*key)?;

        match kind {
            SlruKind::Clog => {
-                if !(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8) {
-                    return Err(BasebackupError::Server(anyhow!(
-                        "invalid SlruKind::Clog record: block.len()={}",
-                        block.len()
-                    )));
-                }
+                ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
            }
            SlruKind::MultiXactMembers | SlruKind::MultiXactOffsets => {
-                if block.len() != BLCKSZ as usize {
-                    return Err(BasebackupError::Server(anyhow!(
-                        "invalid {:?} record: block.len()={}",
-                        kind,
-                        block.len()
-                    )));
-                }
+                ensure!(block.len() == BLCKSZ as usize);
            }
        }

@@ -215,15 +194,12 @@ where
        Ok(())
    }

-    async fn flush(&mut self) -> Result<(), BasebackupError> {
+    async fn flush(&mut self) -> anyhow::Result<()> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
-        self.ar
-            .append(&header, self.buf.as_slice())
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, self.buf.as_slice()).await?;

        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -233,7 +209,7 @@ where
        Ok(())
    }

-    async fn finish(mut self) -> Result<(), BasebackupError> {
+    async fn finish(mut self) -> anyhow::Result<()> {
        let res = if self.current_segment.is_none() || self.buf.is_empty() {
            Ok(())
        } else {
@@ -250,7 +226,7 @@ impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
 {
-    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
+    async fn send_tarball(mut self) -> anyhow::Result<()> {
        // TODO include checksum

        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
@@ -286,25 +262,16 @@ where
            let slru_partitions = self
                .timeline
                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
-                .partition(
-                    self.timeline.get_shard_identity(),
-                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
-                );
+                .await?
+                .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);

            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);

            for part in slru_partitions.parts {
-                let blocks = self
-                    .timeline
-                    .get_vectored(part, self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                let blocks = self.timeline.get_vectored(part, self.lsn, self.ctx).await?;

                for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
-                    slru_builder.add_block(&key, block).await?;
+                    slru_builder.add_block(&key, block?).await?;
                }
            }
            slru_builder.finish().await?;
@@ -312,11 +279,8 @@ where

        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;

@@ -325,8 +289,7 @@ where
            let rels = self
                .timeline
                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
@@ -349,12 +312,7 @@ where
                }
            }

-            for (path, content) in self
-                .timeline
-                .list_aux_files(self.lsn, self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
-            {
+            for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
                if path.starts_with("pg_replslot") {
                    let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                    let restart_lsn = Lsn(u64::from_le_bytes(
@@ -385,41 +343,34 @@ where
        for xid in self
            .timeline
            .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
        {
            self.add_twophase_file(xid).await?;
        }

        fail_point!("basebackup-before-control-file", |_| {
-            Err(BasebackupError::Server(anyhow!(
-                "failpoint basebackup-before-control-file"
-            )))
+            bail!("failpoint basebackup-before-control-file")
        });

        // Generate pg_control and bootstrap WAL segment.
        self.add_pgcontrol_file().await?;
-        self.ar.finish().await.map_err(BasebackupError::Client)?;
+        self.ar.finish().await?;
        debug!("all tarred up!");
        Ok(())
    }

    /// Add contents of relfilenode `src`, naming it as `dst`.
-    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
+    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
+            .await?;

        // If the relation is empty, create an empty file
        if nblocks == 0 {
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
-            self.ar
-                .append(&header, &mut io::empty())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, &mut io::empty()).await?;
            return Ok(());
        }

@@ -433,18 +384,14 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
+                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }

            let file_name = dst.to_segfile_name(seg as u32);
            let header = new_tar_header(&file_name, segment_data.len() as u64)?;
-            self.ar
-                .append(&header, segment_data.as_slice())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, segment_data.as_slice()).await?;

            seg += 1;
            startblk = endblk;
@@ -464,22 +411,20 @@ where
        spcnode: u32,
        dbnode: u32,
        has_relmap_file: bool,
-    ) -> Result<(), BasebackupError> {
+    ) -> anyhow::Result<()> {
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;

-            if img.len()
-                != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
-            {
-                return Err(BasebackupError::Server(anyhow!(
-                    "img.len() != SIZE_OF_RELMAPFILE, img.len()={}",
-                    img.len(),
-                )));
-            }
+            ensure!(
+                img.len()
+                    == dispatch_pgversion!(
+                        self.timeline.pg_version,
+                        pgv::bindings::SIZEOF_RELMAPFILE
+                    )
+            );

            Some(img)
        } else {
@@ -492,20 +437,14 @@ where
                ver => format!("{ver}\x0A"),
            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
-            self.ar
-                .append(&header, pg_version_str.as_bytes())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, pg_version_str.as_bytes()).await?;

            info!("timeline.pg_version {}", self.timeline.pg_version);

            if let Some(img) = relmap_img {
                // filenode map for global tablespace
                let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
-                self.ar
-                    .append(&header, &img[..])
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, &img[..]).await?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -524,26 +463,18 @@ where
                && self
                    .timeline
                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                    .is_empty()
            {
                return Ok(());
            }
            // User defined tablespaces are not supported
-            if spcnode != DEFAULTTABLESPACE_OID {
-                return Err(BasebackupError::Server(anyhow!(
-                    "spcnode != DEFAULTTABLESPACE_OID, spcnode={spcnode}"
-                )));
-            }
+            ensure!(spcnode == DEFAULTTABLESPACE_OID);

            // Append dir path for each database
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
-            self.ar
-                .append(&header, &mut io::empty())
-                .await
-                .map_err(BasebackupError::Client)?;
+            self.ar.append(&header, &mut io::empty()).await?;

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -553,17 +484,11 @@ where
                    ver => format!("{ver}\x0A"),
                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
-                self.ar
-                    .append(&header, pg_version_str.as_bytes())
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, pg_version_str.as_bytes()).await?;

                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
-                self.ar
-                    .append(&header, &img[..])
-                    .await
-                    .map_err(BasebackupError::Client)?;
+                self.ar.append(&header, &img[..]).await?;
            }
        };
        Ok(())
@@ -572,12 +497,11 @@ where
    //
    // Extract twophase state files
    //
-    async fn add_twophase_file(&mut self, xid: TransactionId) -> Result<(), BasebackupError> {
+    async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
        let img = self
            .timeline
            .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;

        let mut buf = BytesMut::new();
        buf.extend_from_slice(&img[..]);
@@ -585,10 +509,7 @@ where
        buf.put_u32_le(crc);
        let path = format!("pg_twophase/{:>08X}", xid);
        let header = new_tar_header(&path, buf.len() as u64)?;
-        self.ar
-            .append(&header, &buf[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, &buf[..]).await?;

        Ok(())
    }
@@ -597,28 +518,24 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
-    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
+    async fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
            if self.lsn == self.timeline.get_ancestor_lsn() {
-                write!(zenith_signal, "PREV LSN: none")
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                write!(zenith_signal, "PREV LSN: none")?;
            } else {
-                write!(zenith_signal, "PREV LSN: invalid")
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                write!(zenith_signal, "PREV LSN: invalid")?;
            }
        } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
        }
        self.ar
            .append(
                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
                zenith_signal.as_bytes(),
            )
-            .await
-            .map_err(BasebackupError::Client)?;
+            .await?;

        let checkpoint_bytes = self
            .timeline
@@ -640,10 +557,7 @@ where

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
-        self.ar
-            .append(&header, &pg_control_bytes[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        self.ar.append(&header, &pg_control_bytes[..]).await?;

        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -658,16 +572,8 @@ where
            self.lsn,
        )
        .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
-        if wal_seg.len() != WAL_SEGMENT_SIZE {
-            return Err(BasebackupError::Server(anyhow!(
-                "wal_seg.len() != WAL_SEGMENT_SIZE, wal_seg.len()={}",
-                wal_seg.len()
-            )));
-        }
-        self.ar
-            .append(&header, &wal_seg[..])
-            .await
-            .map_err(BasebackupError::Client)?;
+        ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
+        self.ar.append(&header, &wal_seg[..]).await?;
        Ok(())
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -3,7 +3,6 @@
 //! Main entry point for the Page Server executable.

 use std::env::{var, VarError};
-use std::io::Read;
 use std::sync::Arc;
 use std::time::Duration;
 use std::{env, ops::ControlFlow, str::FromStr};
@@ -122,10 +121,8 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );

-    // after setting up logging, log the effective IO engine choice and read path implementations
+    // after setting up logging, log the effective IO engine choice
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.get_impl, "starting with get page implementation");
-    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -152,34 +149,37 @@ fn initialize_config(
    workdir: &Utf8Path,
 ) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
    let init = arg_matches.get_flag("init");
+    let update_config = init || arg_matches.get_flag("update-config");

-    let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
-        Ok(mut f) => {
-            if init {
-                anyhow::bail!("config file already exists: {cfg_file_path}");
-            }
-            let md = f.metadata().context("stat config file")?;
-            if md.is_file() {
-                let mut s = String::new();
-                f.read_to_string(&mut s).context("read config file")?;
-                Some(s.parse().context("parse config file toml")?)
-            } else {
-                anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
-            }
-        }
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
-        Err(e) => {
-            anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
+    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
+        if init {
+            anyhow::bail!(
+                "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
+            );
        }
+        // Supplement the CLI arguments with the config file
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
+            .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
+        (
+            cfg_file_contents
+                .parse::<toml_edit::Document>()
+                .with_context(|| {
+                    format!("Failed to parse '{cfg_file_path}' as pageserver config")
+                })?,
+            true,
+        )
+    } else if cfg_file_path.exists() {
+        anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
+    } else {
+        // No config file exists yet, so start from the built-in default config
+        (
+            DEFAULT_CONFIG_FILE
+                .parse::<toml_edit::Document>()
+                .context("could not parse built-in config file")?,
+            false,
+        )
    };

-    let mut effective_config = file_contents.unwrap_or_else(|| {
-        DEFAULT_CONFIG_FILE
-            .parse()
-            .expect("unit tests ensure this works")
-    });
-
-    // Patch with overrides from the command line
    if let Some(values) = arg_matches.get_many::<String>("config-override") {
        for option_line in values {
            let doc = toml_edit::Document::from_str(option_line).with_context(|| {
@@ -187,21 +187,22 @@ fn initialize_config(
            })?;

            for (key, item) in doc.iter() {
-                effective_config.insert(key, item.clone());
+                if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
+                    anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
+                }
+                toml.insert(key, item.clone());
            }
        }
    }

-    debug!("Resulting toml: {effective_config}");
-
-    // Construct the runtime representation
-    let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
+    debug!("Resulting toml: {toml}");
+    let conf = PageServerConf::parse_and_validate(&toml, workdir)
        .context("Failed to parse pageserver configuration")?;

-    if init {
+    if update_config {
        info!("Writing pageserver config to '{cfg_file_path}'");

-        std::fs::write(cfg_file_path, effective_config.to_string())
+        std::fs::write(cfg_file_path, toml.to_string())
            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
        info!("Config successfully written to '{cfg_file_path}'")
    }
@@ -755,13 +756,18 @@ fn cli() -> Command {
        // See `settings.md` for more details on the extra configuration patameters pageserver can process
        .arg(
            Arg::new("config-override")
-                .long("config-override")
                .short('c')
                .num_args(1)
                .action(ArgAction::Append)
                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
+        .arg(
+            Arg::new("update-config")
+                .long("update-config")
+                .action(ArgAction::SetTrue)
+                .help("Update the config file when started"),
+        )
        .arg(
            Arg::new("enabled-features")
                .long("enabled-features")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -9,7 +9,7 @@ use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde;
 use serde::de::IntoDeserializer;
-use std::env;
+use std::{collections::HashMap, env};
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
@@ -30,9 +30,9 @@ use utils::{
    logging::LogFormat,
 };

+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
-use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -51,7 +51,7 @@ pub mod defaults {
    use crate::tenant::config::defaults::*;
    use const_format::formatcp;

-    pub use pageserver_api::config::{
+    pub use pageserver_api::{
        DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
        DEFAULT_PG_LISTEN_PORT,
    };
@@ -91,8 +91,6 @@ pub mod defaults {

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

-    pub const DEFAULT_GET_IMPL: &str = "legacy";
-
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
@@ -140,8 +138,6 @@ pub mod defaults {

 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'

-#get_impl = '{DEFAULT_GET_IMPL}'
-
 #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
@@ -288,8 +284,6 @@ pub struct PageServerConf {

    pub get_vectored_impl: GetVectoredImpl,

-    pub get_impl: GetImpl,
-
    pub max_vectored_read_bytes: MaxVectoredReadBytes,

    pub validate_vectored_get: bool,
@@ -335,6 +329,26 @@ impl<T: Clone> BuilderValue<T> {
    }
 }

+// Certain metadata (e.g. externally-addressable name, AZ) is delivered
+// as a separate structure.  This information is not needed by the pageserver
+// itself, it is only used for registering the pageserver with the control
+// plane and/or storage controller.
+//
+#[derive(serde::Deserialize)]
+pub(crate) struct NodeMetadata {
+    #[serde(rename = "host")]
+    pub(crate) postgres_host: String,
+    #[serde(rename = "port")]
+    pub(crate) postgres_port: u16,
+    pub(crate) http_host: String,
+    pub(crate) http_port: u16,
+
+    // Deployment tools may write fields to the metadata file beyond what we
+    // use in this type: this type intentionally only names fields that we require.
+    #[serde(flatten)]
+    pub(crate) other: HashMap<String, serde_json::Value>,
+}
+
 // needed to simplify config construction
 #[derive(Default)]
 struct PageServerConfigBuilder {
@@ -400,8 +414,6 @@ struct PageServerConfigBuilder {

    get_vectored_impl: BuilderValue<GetVectoredImpl>,

-    get_impl: BuilderValue<GetImpl>,
-
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

    validate_vectored_get: BuilderValue<bool>,
@@ -491,7 +503,6 @@ impl PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
-            get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
@@ -670,10 +681,6 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

-    pub fn get_impl(&mut self, value: GetImpl) {
-        self.get_impl = BuilderValue::Set(value);
-    }
-
    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
        self.max_vectored_read_bytes = BuilderValue::Set(value);
    }
@@ -743,7 +750,6 @@ impl PageServerConfigBuilder {
                secondary_download_concurrency,
                ingest_batch_size,
                get_vectored_impl,
-                get_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
@@ -1029,9 +1035,6 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
-                "get_impl" => {
-                    builder.get_impl(parse_toml_from_str("get_impl", item)?)
-                }
                "max_vectored_read_bytes" => {
                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
                    builder.get_max_vectored_read_bytes(
@@ -1123,7 +1126,6 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-            get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
            max_vectored_read_bytes: MaxVectoredReadBytes(
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
@@ -1363,7 +1365,6 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
@@ -1437,7 +1438,6 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
@@ -1557,7 +1557,6 @@ broker_endpoint = '{broker_endpoint}'
                        endpoint: Some(endpoint.clone()),
                        concurrency_limit: s3_concurrency_limit,
                        max_keys_per_list_response: None,
-                        upload_storage_class: None,
                    }),
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -14,8 +14,10 @@ use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};

-use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
-use pageserver_api::config::NodeMetadata;
+use crate::{
+    config::{NodeMetadata, PageServerConf},
+    virtual_file::on_fatal_io_error,
+};

 /// The Pageserver's client for using the control plane API: this is a small subset
 /// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
@@ -63,7 +65,7 @@ impl ControlPlaneClient {
        let mut client = reqwest::ClientBuilder::new();

        if let Some(jwt) = &conf.control_plane_api_token {
-            let mut headers = reqwest::header::HeaderMap::new();
+            let mut headers = hyper::HeaderMap::new();
            headers.insert(
                "Authorization",
                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -38,7 +38,7 @@ use deleter::DeleterMessage;
 use list_writer::ListWriterQueueMessage;
 use validator::ValidatorQueueMessage;

-use crate::{config::PageServerConf, tenant::storage_layer::LayerName};
+use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

 // TODO: configurable for how long to wait before executing deletions

@@ -479,7 +479,7 @@ impl DeletionQueueClient {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");
@@ -511,7 +511,7 @@ impl DeletionQueueClient {
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
@@ -734,20 +734,20 @@ mod test {
    use crate::{
        control_plane_client::RetryForeverError,
        repository::Key,
-        tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
+        tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
    };

    use super::*;
    pub const TIMELINE_ID: TimelineId =
        TimelineId::from_array(hex!("11223344556677881122334455667788"));

-    pub const EXAMPLE_LAYER_NAME: LayerName = LayerName::Delta(DeltaLayerName {
+    pub const EXAMPLE_LAYER_NAME: LayerFileName = LayerFileName::Delta(DeltaFileName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B59D8)..Lsn(0x00000000016B5A51),
    });

    // When you need a second layer in a test.
-    pub const EXAMPLE_LAYER_NAME_ALT: LayerName = LayerName::Delta(DeltaLayerName {
+    pub const EXAMPLE_LAYER_NAME_ALT: LayerFileName = LayerFileName::Delta(DeltaFileName {
        key_range: Key::from_i128(0x0)..Key::from_i128(0xFFFFFFFFFFFFFFFF),
        lsn_range: Lsn(0x00000000016B5A51)..Lsn(0x00000000016B5A61),
    });
@@ -797,7 +797,7 @@ mod test {
        /// Returns remote layer file name, suitable for use in assert_remote_files
        fn write_remote_layer(
            &self,
-            file_name: LayerName,
+            file_name: LayerFileName,
            gen: Generation,
        ) -> anyhow::Result<String> {
            let tenant_shard_id = self.harness.tenant_shard_id;
@@ -952,7 +952,7 @@ mod test {
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

-        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_shard_id = ctx.harness.tenant_shard_id;

        let content: Vec<u8> = "victim1 contents".into();
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -34,7 +34,7 @@ use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
-use crate::tenant::storage_layer::LayerName;
+use crate::tenant::storage_layer::LayerFileName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;

@@ -59,7 +59,7 @@ pub(super) struct DeletionOp {
    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
    // have a config object handy to project it to a remote key, and need the consuming worker
    // to do it for you.
-    pub(super) layers: Vec<(LayerName, LayerFileMetadata)>,
+    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
    pub(super) objects: Vec<RemotePath>,

    /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -64,7 +64,7 @@ use crate::{
        mgr::TenantManager,
        remote_timeline_client::LayerFileMetadata,
        secondary::SecondaryTenant,
-        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
    },
 };

@@ -540,12 +540,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    js.spawn(async move {
                        layer
                            .secondary_tenant
-                            .evict_layer(
-                                tenant_manager.get_conf(),
-                                layer.timeline_id,
-                                layer.name,
-                                layer.metadata,
-                            )
+                            .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
                            .await;
                        Ok(file_size)
                    });
@@ -604,7 +599,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 pub(crate) struct EvictionSecondaryLayer {
    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
    pub(crate) timeline_id: TimelineId,
-    pub(crate) name: LayerName,
+    pub(crate) name: LayerFileName,
    pub(crate) metadata: LayerFileMetadata,
 }

@@ -637,9 +632,9 @@ impl EvictionLayer {
        }
    }

-    pub(crate) fn get_name(&self) -> LayerName {
+    pub(crate) fn get_name(&self) -> LayerFileName {
        match self {
-            Self::Attached(l) => l.layer_desc().layer_name(),
+            Self::Attached(l) => l.layer_desc().filename(),
            Self::Secondary(sl) => sl.name.clone(),
        }
    }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -782,6 +782,9 @@ components:
      required:
        - mode
      properties:
+        tenant_id:
+          type: string
+          description: Not used, scheduled for removal.
        mode:
          type: string
          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -19,8 +19,6 @@ use pageserver_api::models::LocationConfigListResponse;
 use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigResponse;
-use pageserver_api::models::TenantScanRemoteStorageResponse;
-use pageserver_api::models::TenantScanRemoteStorageShard;
 use pageserver_api::models::TenantShardLocation;
 use pageserver_api::models::TenantShardSplitRequest;
 use pageserver_api::models::TenantShardSplitResponse;
@@ -31,7 +29,6 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::ShardCount;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeTravelError;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -57,13 +54,9 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::remote_timeline_client;
-use crate::tenant::remote_timeline_client::download_index_part;
-use crate::tenant::remote_timeline_client::list_remote_tenant_shards;
-use crate::tenant::remote_timeline_client::list_remote_timelines;
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::storage_layer::LayerName;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::SpawnMode;
@@ -167,9 +160,6 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
-            PageReconstructError::MissingKey(e) => {
-                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
-            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
@@ -1229,15 +1219,13 @@ async fn layer_download_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let layer_name = LayerName::from_str(layer_file_name)
-        .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
    let state = get_state(&request);

    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
    let downloaded = timeline
-        .download_layer(&layer_name)
+        .download_layer(layer_file_name)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1261,14 +1249,11 @@ async fn evict_timeline_layer_handler(
    let layer_file_name = get_request_param(&request, "layer_file_name")?;
    let state = get_state(&request);

-    let layer_name = LayerName::from_str(layer_file_name)
-        .map_err(|s| ApiError::BadRequest(anyhow::anyhow!(s)))?;
-
    let timeline =
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
    let evicted = timeline
-        .evict_layer(&layer_name)
+        .evict_layer(layer_file_name)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1833,75 +1818,6 @@ async fn timeline_download_remote_layers_handler_get(
    json_response(StatusCode::OK, info)
 }

-async fn timeline_detach_ancestor_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    use crate::tenant::timeline::detach_ancestor::Options;
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-
-    let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
-
-    async move {
-        let mut options = Options::default();
-
-        let rewrite_concurrency =
-            parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
-        let copy_concurrency =
-            parse_query_param::<_, std::num::NonZeroUsize>(&request, "copy_concurrency")?;
-
-        [
-            (&mut options.rewrite_concurrency, rewrite_concurrency),
-            (&mut options.copy_concurrency, copy_concurrency),
-        ]
-        .into_iter()
-        .filter_map(|(target, val)| val.map(|val| (target, val)))
-        .for_each(|(target, val)| *target = val);
-
-        let state = get_state(&request);
-
-        let tenant = state
-            .tenant_manager
-            .get_attached_tenant_shard(tenant_shard_id)?;
-
-        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
-
-        let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
-        let ctx = &ctx;
-
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| ApiError::NotFound(e.into()))?;
-
-        let (_guard, prepared) = timeline
-            .prepare_to_detach_from_ancestor(&tenant, options, ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-
-        let res = state
-            .tenant_manager
-            .complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
-            .await;
-
-        match res {
-            Ok(reparented_timelines) => {
-                let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
-                    reparented_timelines,
-                };
-
-                json_response(StatusCode::OK, resp)
-            }
-            Err(e) => Err(ApiError::InternalServerError(
-                e.context("timeline detach completion"),
-            )),
-        }
-    }
-    .instrument(span)
-    .await
-}
-
 async fn deletion_queue_flush(
    r: Request<Body>,
    cancel: CancellationToken,
@@ -1993,14 +1909,12 @@ async fn timeline_collect_keyspace(
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let (dense_ks, sparse_ks) = timeline
+        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
            .map_err(|e| ApiError::InternalServerError(e.into()))?;

-        // This API is currently used by pagebench. Pagebench will iterate all keys within the keyspace.
-        // Therefore, we split dense/sparse keys in this API.
-        let res = pageserver_api::models::partitioning::Partitioning { keys: dense_ks, sparse_keys: sparse_ks, at_lsn };
+        let res = pageserver_api::models::partitioning::Partitioning { keys, at_lsn };

        json_response(StatusCode::OK, res)
    }
@@ -2118,79 +2032,6 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_scan_remote_handler(
-    request: Request<Body>,
-    cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-
-    let Some(remote_storage) = state.remote_storage.as_ref() else {
-        return Err(ApiError::BadRequest(anyhow::anyhow!(
-            "Remote storage not configured"
-        )));
-    };
-
-    let mut response = TenantScanRemoteStorageResponse::default();
-
-    let (shards, _other_keys) =
-        list_remote_tenant_shards(remote_storage, tenant_id, cancel.clone())
-            .await
-            .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-
-    for tenant_shard_id in shards {
-        let (timeline_ids, _other_keys) =
-            list_remote_timelines(remote_storage, tenant_shard_id, cancel.clone())
-                .await
-                .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-
-        let mut generation = Generation::none();
-        for timeline_id in timeline_ids {
-            match download_index_part(
-                remote_storage,
-                &tenant_shard_id,
-                &timeline_id,
-                Generation::MAX,
-                &cancel,
-            )
-            .instrument(info_span!("download_index_part",
-                         tenant_id=%tenant_shard_id.tenant_id,
-                         shard_id=%tenant_shard_id.shard_slug(),
-                         %timeline_id))
-            .await
-            {
-                Ok((index_part, index_generation)) => {
-                    tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)",
-                        index_part.layer_metadata.len(), index_part.get_disk_consistent_lsn());
-                    generation = std::cmp::max(generation, index_generation);
-                }
-                Err(DownloadError::NotFound) => {
-                    // This is normal for tenants that were created with multiple shards: they have an unsharded path
-                    // containing the timeline's initdb tarball but no index.  Otherwise it is a bit strange.
-                    tracing::info!("Timeline path {tenant_shard_id}/{timeline_id} exists in remote storage but has no index, skipping");
-                    continue;
-                }
-                Err(e) => {
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
-                }
-            };
-        }
-
-        response.shards.push(TenantScanRemoteStorageShard {
-            tenant_shard_id,
-            generation: generation.into(),
-        });
-    }
-
-    if response.shards.is_empty() {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("No shards found for tenant ID {tenant_id}").into(),
-        ));
-    }
-
-    json_response(StatusCode::OK, response)
-}
-
 async fn secondary_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2235,27 +2076,6 @@ async fn secondary_download_handler(
    json_response(status, progress)
 }

-async fn secondary_status_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let state = get_state(&request);
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-
-    let Some(secondary_tenant) = state
-        .tenant_manager
-        .get_secondary_tenant_shard(tenant_shard_id)
-    else {
-        return Err(ApiError::NotFound(
-            anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
-        ));
-    };
-
-    let progress = secondary_tenant.progress.lock().unwrap().clone();
-
-    json_response(StatusCode::OK, progress)
-}
-
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -2590,10 +2410,6 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
-        .put(
-            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
-            |r| api_handler(r, timeline_detach_ancestor_handler),
-        )
        .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_delete_handler)
        })
@@ -2612,18 +2428,12 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
            api_handler(r, secondary_upload_handler)
        })
-        .get("/v1/tenant/:tenant_id/scan_remote_storage", |r| {
-            api_handler(r, tenant_scan_remote_handler)
-        })
        .put("/v1/disk_usage_eviction/run", |r| {
            api_handler(r, disk_usage_eviction_run)
        })
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
-        .get("/v1/tenant/:tenant_shard_id/secondary/status", |r| {
-            api_handler(r, secondary_status_handler)
-        })
        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
            api_handler(r, secondary_download_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,7 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub use pageserver_api::keyspace;
-pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,7 +9,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
-use strum_macros::{IntoStaticStr, VariantNames};
+use strum_macros::{EnumVariantNames, IntoStaticStr};
 use tracing::warn;
 use utils::id::TimelineId;

@@ -27,7 +27,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 ];

 // Metrics collected on operations on the storage repository.
-#[derive(Debug, VariantNames, IntoStaticStr)]
+#[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
@@ -51,9 +51,6 @@ pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "gc")]
    Gc,

-    #[strum(serialize = "find gc cutoffs")]
-    FindGcCutoffs,
-
    #[strum(serialize = "create tenant")]
    CreateTenant,
 }
@@ -89,58 +86,41 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_layers_visited_per_read_global",
-        "Number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-    )
-    .expect("failed to define a metric")
-});
-
-pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_layers_visited_per_vectored_read_global",
-        "Average number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+        "pageserver_read_num_fs_layers",
+        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-#[derive(
-    Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
-)]
-pub(crate) enum GetKind {
-    Singular,
-    Vectored,
-}

 pub(crate) struct ReconstructTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
+    ok: Histogram,
+    err: Histogram,
 }

 pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
-        &["get_kind"],
+        &["result"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric");
-
    ReconstructTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
+        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
+        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
    }
 });

 impl ReconstructTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
+    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
+        match result {
+            Ok(_) => &self.ok,
+            Err(_) => &self.err,
        }
    }
 }
@@ -153,33 +133,13 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) struct ReconstructDataTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
-}
-
-impl ReconstructDataTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
-        }
-    }
-}
-
-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
-        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric");
-
-    ReconstructDataTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
-    }
+    .expect("failed to define a metric")
 });

 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -194,11 +154,6 @@ pub(crate) struct GetVectoredLatency {
    map: EnumMap<TaskKind, Option<Histogram>>,
 }

-#[allow(dead_code)]
-pub(crate) struct ScanLatency {
-    map: EnumMap<TaskKind, Option<Histogram>>,
-}
-
 impl GetVectoredLatency {
    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
    // cardinality of the metric.
@@ -209,48 +164,6 @@ impl GetVectoredLatency {
    }
 }

-impl ScanLatency {
-    // Only these task types perform vectored gets. Filter all other tasks out to reduce total
-    // cardinality of the metric.
-    const TRACKED_TASK_KINDS: [TaskKind; 1] = [TaskKind::PageRequestHandler];
-
-    pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> {
-        self.map[task_kind].as_ref()
-    }
-}
-
-pub(crate) struct ScanLatencyOngoingRecording<'a> {
-    parent: &'a Histogram,
-    start: std::time::Instant,
-}
-
-impl<'a> ScanLatencyOngoingRecording<'a> {
-    pub(crate) fn start_recording(parent: &'a Histogram) -> ScanLatencyOngoingRecording<'a> {
-        let start = Instant::now();
-        ScanLatencyOngoingRecording { parent, start }
-    }
-
-    pub(crate) fn observe(self, throttled: Option<Duration>) {
-        let elapsed = self.start.elapsed();
-        let ex_throttled = if let Some(throttled) = throttled {
-            elapsed.checked_sub(throttled)
-        } else {
-            Some(elapsed)
-        };
-        if let Some(ex_throttled) = ex_throttled {
-            self.parent.observe(ex_throttled.as_secs_f64());
-        } else {
-            use utils::rate_limit::RateLimit;
-            static LOGGED: Lazy<Mutex<RateLimit>> =
-                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-            let mut rate_limit = LOGGED.lock().unwrap();
-            rate_limit.call(|| {
-                warn!("error deducting time spent throttled; this message is logged at a global rate limit");
-            });
-        }
-    }
-}
-
 pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_get_vectored_seconds",
@@ -274,29 +187,6 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
    }
 });

-pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
-        "pageserver_scan_seconds",
-        "Time spent in scan, excluding time spent in timeline_get_throttle.",
-        &["task_kind"],
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric");
-
-    ScanLatency {
-        map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| {
-            let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx);
-
-            if ScanLatency::TRACKED_TASK_KINDS.contains(&task_kind) {
-                let task_kind = task_kind.into();
-                Some(inner.with_label_values(&[task_kind]))
-            } else {
-                None
-            }
-        })),
-    }
-});
-
 pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,
@@ -1512,80 +1402,29 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
 });

 pub(crate) struct TenantManagerMetrics {
-    tenant_slots_attached: UIntGauge,
-    tenant_slots_secondary: UIntGauge,
-    tenant_slots_inprogress: UIntGauge,
+    pub(crate) tenant_slots: UIntGauge,
    pub(crate) tenant_slot_writes: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
 }

-impl TenantManagerMetrics {
-    /// Helpers for tracking slots.  Note that these do not track the lifetime of TenantSlot objects
-    /// exactly: they track the lifetime of the slots _in the tenant map_.
-    pub(crate) fn slot_inserted(&self, slot: &TenantSlot) {
-        match slot {
-            TenantSlot::Attached(_) => {
-                self.tenant_slots_attached.inc();
-            }
-            TenantSlot::Secondary(_) => {
-                self.tenant_slots_secondary.inc();
-            }
-            TenantSlot::InProgress(_) => {
-                self.tenant_slots_inprogress.inc();
-            }
-        }
-    }
-
-    pub(crate) fn slot_removed(&self, slot: &TenantSlot) {
-        match slot {
-            TenantSlot::Attached(_) => {
-                self.tenant_slots_attached.dec();
-            }
-            TenantSlot::Secondary(_) => {
-                self.tenant_slots_secondary.dec();
-            }
-            TenantSlot::InProgress(_) => {
-                self.tenant_slots_inprogress.dec();
-            }
-        }
-    }
-
-    #[cfg(all(debug_assertions, not(test)))]
-    pub(crate) fn slots_total(&self) -> u64 {
-        self.tenant_slots_attached.get()
-            + self.tenant_slots_secondary.get()
-            + self.tenant_slots_inprogress.get()
-    }
-}
-
 pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
-    let tenant_slots = register_uint_gauge_vec!(
+    TenantManagerMetrics {
+    tenant_slots: register_uint_gauge!(
        "pageserver_tenant_manager_slots",
        "How many slots currently exist, including all attached, secondary and in-progress operations",
-        &["mode"]
    )
-    .expect("failed to define a metric");
-    TenantManagerMetrics {
-        tenant_slots_attached: tenant_slots
-            .get_metric_with_label_values(&["attached"])
-            .unwrap(),
-        tenant_slots_secondary: tenant_slots
-            .get_metric_with_label_values(&["secondary"])
-            .unwrap(),
-        tenant_slots_inprogress: tenant_slots
-            .get_metric_with_label_values(&["inprogress"])
-            .unwrap(),
-        tenant_slot_writes: register_int_counter!(
-            "pageserver_tenant_manager_slot_writes",
-            "Writes to a tenant slot, including all of create/attach/detach/delete"
-        )
-        .expect("failed to define a metric"),
-        unexpected_errors: register_int_counter!(
-            "pageserver_tenant_manager_unexpected_errors_total",
-            "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
-        )
-        .expect("failed to define a metric"),
-    }
+    .expect("failed to define a metric"),
+    tenant_slot_writes: register_int_counter!(
+        "pageserver_tenant_manager_slot_writes",
+        "Writes to a tenant slot, including all of create/attach/detach/delete"
+    )
+    .expect("failed to define a metric"),
+    unexpected_errors: register_int_counter!(
+        "pageserver_tenant_manager_unexpected_errors_total",
+        "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
+    )
+    .expect("failed to define a metric"),
+}
 });

 pub(crate) struct DeletionQueueMetrics {
@@ -1643,6 +1482,35 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 }
 });

+pub(crate) struct WalIngestMetrics {
+    pub(crate) bytes_received: IntCounter,
+    pub(crate) records_received: IntCounter,
+    pub(crate) records_committed: IntCounter,
+    pub(crate) records_filtered: IntCounter,
+}
+
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    bytes_received: register_int_counter!(
+        "pageserver_wal_ingest_bytes_received",
+        "Bytes of WAL ingested from safekeepers",
+    )
+    .unwrap(),
+    records_received: register_int_counter!(
+        "pageserver_wal_ingest_records_received",
+        "Number of WAL records received from safekeepers"
+    )
+    .expect("failed to define a metric"),
+    records_committed: register_int_counter!(
+        "pageserver_wal_ingest_records_committed",
+        "Number of WAL records which resulted in writes to pageserver storage"
+    )
+    .expect("failed to define a metric"),
+    records_filtered: register_int_counter!(
+        "pageserver_wal_ingest_records_filtered",
+        "Number of WAL records filtered out due to sharding"
+    )
+    .expect("failed to define a metric"),
+});
 pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
@@ -1844,43 +1712,6 @@ macro_rules! redo_bytes_histogram_count_buckets {
    };
 }

-pub(crate) struct WalIngestMetrics {
-    pub(crate) bytes_received: IntCounter,
-    pub(crate) records_received: IntCounter,
-    pub(crate) records_committed: IntCounter,
-    pub(crate) records_filtered: IntCounter,
-    pub(crate) time_spent_on_ingest: Histogram,
-}
-
-pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
-    bytes_received: register_int_counter!(
-        "pageserver_wal_ingest_bytes_received",
-        "Bytes of WAL ingested from safekeepers",
-    )
-    .unwrap(),
-    records_received: register_int_counter!(
-        "pageserver_wal_ingest_records_received",
-        "Number of WAL records received from safekeepers"
-    )
-    .expect("failed to define a metric"),
-    records_committed: register_int_counter!(
-        "pageserver_wal_ingest_records_committed",
-        "Number of WAL records which resulted in writes to pageserver storage"
-    )
-    .expect("failed to define a metric"),
-    records_filtered: register_int_counter!(
-        "pageserver_wal_ingest_records_filtered",
-        "Number of WAL records filtered out due to sharding"
-    )
-    .expect("failed to define a metric"),
-    time_spent_on_ingest: register_histogram!(
-        "pageserver_wal_ingest_put_value_seconds",
-        "Actual time spent on ingesting a record",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric"),
-});
-
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_seconds",
@@ -2034,22 +1865,6 @@ impl StorageTimeMetricsTimer {
        self.metrics.timeline_count.inc();
        self.metrics.global_histogram.observe(duration);
    }
-
-    /// Turns this timer into a timer, which will always record -- usually this means recording
-    /// regardless an early `?` path was taken in a function.
-    pub(crate) fn record_on_drop(self) -> AlwaysRecordingStorageTimeMetricsTimer {
-        AlwaysRecordingStorageTimeMetricsTimer(Some(self))
-    }
-}
-
-pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
-
-impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
-    fn drop(&mut self) {
-        if let Some(inner) = self.0.take() {
-            inner.stop_and_record();
-        }
-    }
 }

 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
@@ -2110,7 +1925,6 @@ pub(crate) struct TimelineMetrics {
    pub imitate_logical_size_histo: StorageTimeMetrics,
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
-    pub find_gc_cutoffs_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
@@ -2171,12 +1985,6 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let find_gc_cutoffs_histo = StorageTimeMetrics::new(
-            StorageTimeOperation::FindGcCutoffs,
-            &tenant_id,
-            &shard_id,
-            &timeline_id,
-        );
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2219,7 +2027,6 @@ impl TimelineMetrics {
            logical_size_histo,
            imitate_logical_size_histo,
            garbage_collect_histo,
-            find_gc_cutoffs_histo,
            load_layer_map_histo,
            last_record_gauge,
            resident_physical_size_gauge,
@@ -2326,7 +2133,6 @@ use std::time::{Duration, Instant};

 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
-use crate::tenant::mgr::TenantSlot;

 /// Maintain a per timeline gauge in addition to the global gauge.
 struct PerTimelineRemotePhysicalSizeGauge {
@@ -2929,8 +2735,6 @@ pub fn preinitialize_metrics() {
        &WALRECEIVER_CANDIDATES_REMOVED,
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
        &tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
-        &REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
-        &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
    ]
    .into_iter()
    .for_each(|c| {
@@ -2967,8 +2771,7 @@ pub fn preinitialize_metrics() {

    // histograms
    [
-        &READ_NUM_LAYERS_VISITED,
-        &VEC_READ_NUM_LAYERS_VISITED,
+        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1,5 +1,13 @@
+//
 //! The Page Service listens for client connections and serves their GetPage@LSN
 //! requests.
+//
+//   It is possible to connect here using the usual psql/pgbench/libpq clients.
+// The following commands are currently supported:
+//     *status* -- show current info about this pageserver,
+//     *pagestream* -- enter a mode where smgr and pageserver talk using their
+//  custom protocol.
+//

 use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
@@ -15,7 +23,7 @@ use pageserver_api::models::{
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
-    PagestreamNblocksResponse, PagestreamProtocolVersion,
+    PagestreamNblocksResponse,
 };
 use pageserver_api::shard::ShardIndex;
 use pageserver_api::shard::ShardNumber;
@@ -48,7 +56,6 @@ use utils::{

 use crate::auth::check_permission;
 use crate::basebackup;
-use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
@@ -544,7 +551,6 @@ impl PageServerHandler {
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        protocol_version: PagestreamProtocolVersion,
        ctx: RequestContext,
    ) -> Result<(), QueryError>
    where
@@ -607,15 +613,14 @@ impl PageServerHandler {
                t.trace(&copy_data_bytes)
            }

-            let neon_fe_msg =
-                PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;

            // TODO: We could create a new per-request context here, with unique ID.
            // Currently we use the same per-timeline context for all requests

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -624,7 +629,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -634,7 +639,7 @@ impl PageServerHandler {
                }
                PagestreamFeMessage::GetPage(req) => {
                    // shard_id is filled in by the handler
-                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -643,7 +648,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
                    (
                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -652,7 +657,7 @@ impl PageServerHandler {
                    )
                }
                PagestreamFeMessage::GetSlruSegment(req) => {
-                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
+                    let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
                    (
                        self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
@@ -833,80 +838,78 @@ impl PageServerHandler {
    /// Helper function to handle the LSN from client request.
    ///
    /// Each GetPage (and Exists and Nblocks) request includes information about
-    /// which version of the page is being requested. The primary compute node
-    /// will always request the latest page version, by setting 'request_lsn' to
-    /// the last inserted or flushed WAL position, while a standby will request
-    /// a version at the LSN that it's currently caught up to.
+    /// which version of the page is being requested. The client can request the
+    /// latest version of the page, or the version that's valid at a particular
+    /// LSN. The primary compute node will always request the latest page
+    /// version, while a standby will request a version at the LSN that it's
+    /// currently caught up to.
    ///
    /// In either case, if the page server hasn't received the WAL up to the
    /// requested LSN yet, we will wait for it to arrive. The return value is
    /// the LSN that should be used to look up the page versions.
-    ///
-    /// In addition to the request LSN, each request carries another LSN,
-    /// 'not_modified_since', which is a hint to the pageserver that the client
-    /// knows that the page has not been modified between 'not_modified_since'
-    /// and the request LSN. This allows skipping the wait, as long as the WAL
-    /// up to 'not_modified_since' has arrived. If the client doesn't have any
-    /// information about when the page was modified, it will use
-    /// not_modified_since == lsn. If the client lies and sends a too low
-    /// not_modified_hint such that there are in fact later page versions, the
-    /// behavior is undefined: the pageserver may return any of the page versions
-    /// or an error.
    async fn wait_or_get_last_lsn(
        timeline: &Timeline,
-        request_lsn: Lsn,
-        not_modified_since: Lsn,
+        mut lsn: Lsn,
+        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
    ) -> Result<Lsn, PageStreamError> {
-        let last_record_lsn = timeline.get_last_record_lsn();
+        if latest {
+            // Latest page version was requested. If LSN is given, it is a hint
+            // to the page server that there have been no modifications to the
+            // page after that LSN. If we haven't received WAL up to that point,
+            // wait until it arrives.
+            let last_record_lsn = timeline.get_last_record_lsn();

-        // Sanity check the request
-        if request_lsn < not_modified_since {
-            return Err(PageStreamError::BadRequest(
-                format!(
-                    "invalid request with request LSN {} and not_modified_since {}",
-                    request_lsn, not_modified_since,
-                )
-                .into(),
-            ));
-        }
-
-        if request_lsn < **latest_gc_cutoff_lsn {
-            // Check explicitly for INVALID just to get a less scary error message if the
-            // request is obviously bogus
-            return Err(if request_lsn == Lsn::INVALID {
-                PageStreamError::BadRequest("invalid LSN(0) in request".into())
+            // Note: this covers the special case that lsn == Lsn(0). That
+            // special case means "return the latest version whatever it is",
+            // and it's used for bootstrapping purposes, when the page server is
+            // connected directly to the compute node. That is needed because
+            // when you connect to the compute node, to receive the WAL, the
+            // walsender process will do a lookup in the pg_authid catalog
+            // table for authentication. That poses a deadlock problem: the
+            // catalog table lookup will send a GetPage request, but the GetPage
+            // request will block in the page server because the recent WAL
+            // hasn't been received yet, and it cannot be received until the
+            // walsender completes the authentication and starts streaming the
+            // WAL.
+            if lsn <= last_record_lsn {
+                lsn = last_record_lsn;
            } else {
-                PageStreamError::BadRequest(format!(
-                        "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-                        request_lsn, **latest_gc_cutoff_lsn
-                    ).into())
-            });
-        }
-
-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
-        if not_modified_since > last_record_lsn {
+                timeline
+                    .wait_lsn(
+                        lsn,
+                        crate::tenant::timeline::WaitLsnWaiter::PageService,
+                        ctx,
+                    )
+                    .await?;
+                // Since we waited for 'lsn' to arrive, that is now the last
+                // record LSN. (Or close enough for our purposes; the
+                // last-record LSN can advance immediately after we return
+                // anyway)
+            }
+        } else {
+            if lsn == Lsn(0) {
+                return Err(PageStreamError::BadRequest(
+                    "invalid LSN(0) in request".into(),
+                ));
+            }
            timeline
                .wait_lsn(
-                    not_modified_since,
+                    lsn,
                    crate::tenant::timeline::WaitLsnWaiter::PageService,
                    ctx,
                )
                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
-            Ok(not_modified_since)
-        } else {
-            // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
-            // here instead. That would give the same result, since we know that there
-            // haven't been any modifications since 'not_modified_since'. Using an older
-            // LSN might be faster, because that could allow skipping recent layers when
-            // finding the page. However, we have historically used 'last_record_lsn', so
-            // stick to that for now.
-            Ok(std::cmp::min(last_record_lsn, request_lsn))
        }
+
+        if lsn < **latest_gc_cutoff_lsn {
+            return Err(PageStreamError::BadRequest(format!(
+                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+                lsn, **latest_gc_cutoff_lsn
+            ).into()));
+        }
+        Ok(lsn)
    }

    #[instrument(skip_all, fields(shard_id))]
@@ -923,17 +926,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -956,17 +954,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -989,17 +982,18 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::Lsn(lsn),
+                req.latest,
+                ctx,
+            )
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -1166,17 +1160,12 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetPageAtLsn, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), ctx)
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
@@ -1199,14 +1188,9 @@ impl PageServerHandler {
            .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);

        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn = Self::wait_or_get_last_lsn(
-            timeline,
-            req.request_lsn,
-            req.not_modified_since,
-            &latest_gc_cutoff_lsn,
-            ctx,
-        )
-        .await?;
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;

        let kind = SlruKind::from_repr(req.kind)
            .ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
@@ -1217,10 +1201,6 @@ impl PageServerHandler {
        ))
    }

-    /// Note on "fullbackup":
-    /// Full basebackups should only be used for debugging purposes.
-    /// Originally, it was introduced to enable breaking storage format changes,
-    /// but that is not applicable anymore.
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
@@ -1237,13 +1217,6 @@ impl PageServerHandler {
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
-        fn map_basebackup_error(err: BasebackupError) -> QueryError {
-            match err {
-                BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
-                BasebackupError::Server(e) => QueryError::Other(e),
-            }
-        }
-
        let started = std::time::Instant::now();

        // check that the timeline exists
@@ -1269,8 +1242,7 @@ impl PageServerHandler {
        let lsn_awaited_after = started.elapsed();

        // switch client to COPYOUT
-        pgb.write_message_noflush(&BeMessage::CopyOutResponse)
-            .map_err(QueryError::Disconnected)?;
+        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
@@ -1285,8 +1257,7 @@ impl PageServerHandler {
                full_backup,
                ctx,
            )
-            .await
-            .map_err(map_basebackup_error)?;
+            .await?;
        } else {
            let mut writer = pgb.copyout_writer();
            if gzip {
@@ -1307,13 +1278,9 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await
-                .map_err(map_basebackup_error)?;
+                .await?;
                // shutdown the encoder to ensure the gzip footer is written
-                encoder
-                    .shutdown()
-                    .await
-                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
+                encoder.shutdown().await?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -1323,13 +1290,11 @@ impl PageServerHandler {
                    full_backup,
                    ctx,
                )
-                .await
-                .map_err(map_basebackup_error)?;
+                .await?;
            }
        }

-        pgb.write_message_noflush(&BeMessage::CopyDone)
-            .map_err(QueryError::Disconnected)?;
+        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        self.flush_cancellable(pgb, &timeline.cancel).await?;

        let basebackup_after = started
@@ -1439,34 +1404,7 @@ where

        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-        if query_string.starts_with("pagestream_v2 ") {
-            let (_, params_raw) = query_string.split_at("pagestream_v2 ".len());
-            let params = params_raw.split(' ').collect::<Vec<_>>();
-            if params.len() != 2 {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "invalid param number for pagestream command"
-                )));
-            }
-            let tenant_id = TenantId::from_str(params[0])
-                .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
-            let timeline_id = TimelineId::from_str(params[1])
-                .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
-
-            tracing::Span::current()
-                .record("tenant_id", field::display(tenant_id))
-                .record("timeline_id", field::display(timeline_id));
-
-            self.check_permission(Some(tenant_id))?;
-
-            self.handle_pagerequests(
-                pgb,
-                tenant_id,
-                timeline_id,
-                PagestreamProtocolVersion::V2,
-                ctx,
-            )
-            .await?;
-        } else if query_string.starts_with("pagestream ") {
+        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
            if params.len() != 2 {
@@ -1485,14 +1423,8 @@ where

            self.check_permission(Some(tenant_id))?;

-            self.handle_pagerequests(
-                pgb,
-                tenant_id,
-                timeline_id,
-                PagestreamProtocolVersion::V1,
-                ctx,
-            )
-            .await?;
+            self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx)
+                .await?;
        } else if query_string.starts_with("basebackup ") {
            let (_, params_raw) = query_string.split_at("basebackup ".len());
            let params = params_raw.split_whitespace().collect::<Vec<_>>();
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,10 +9,9 @@
 use super::tenant::{PageReconstructError, Timeline};
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-use crate::metrics::WAL_INGEST;
+use crate::repository::*;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
 use crate::walrecord::NeonWalRecord;
-use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
@@ -23,8 +22,6 @@ use pageserver_api::key::{
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
-use pageserver_api::keyspace::SparseKeySpace;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -178,6 +175,7 @@ impl Timeline {
        tag: RelTag,
        blknum: BlockNumber,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if tag.relnode == 0 {
@@ -186,7 +184,7 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, version, ctx).await?;
+        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
@@ -208,6 +206,7 @@ impl Timeline {
        spcnode: Oid,
        dbnode: Oid,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;
@@ -215,7 +214,7 @@ impl Timeline {
        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
@@ -226,6 +225,7 @@ impl Timeline {
        &self,
        tag: RelTag,
        version: Version<'_>,
+        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
@@ -239,7 +239,7 @@ impl Timeline {
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, ctx).await?
+            && !self.get_rel_exists(tag, version, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -252,8 +252,16 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-
+        if latest {
+            // Update the relation size cache only if the "latest" flag is set.
+            // Compute sets this flag when it is working with the most recent version of
+            // the relation; the primary compute node typically always sets latest=true.
+            // Note that even if a compute node "by mistake" specifies an old LSN but sets
+            // latest=true, this cannot corrupt the cache: with latest=true the pageserver
+            // chooses max(request_lsn, last_written_lsn), so the cached value is associated
+            // with the most recent LSN.
+            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+        }
        Ok(nblocks)
    }

@@ -262,6 +270,7 @@ impl Timeline {
        &self,
        tag: RelTag,
        version: Version<'_>,
+        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
@@ -280,7 +289,7 @@ impl Timeline {

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
-                let exists = dir.rels.contains(&(tag.relnode, tag.forknum));
+                let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
                Ok(exists)
            }
            Err(e) => Err(PageReconstructError::from(e)),
@@ -380,7 +389,7 @@ impl Timeline {

        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
-                let exists = dir.segments.contains(&segno);
+                let exists = dir.segments.get(&segno).is_some();
                Ok(exists)
            }
            Err(e) => Err(PageReconstructError::from(e)),
@@ -457,12 +466,6 @@ impl Timeline {
                // Didn't find any commit timestamps smaller than the request
                Ok(LsnForTimestamp::Past(min_lsn))
            }
-            (true, _) if commit_lsn < min_lsn => {
-                // the search above did set found_smaller to true but it never increased the lsn.
-                // Then, low is still the old min_lsn, and the subtraction above gave a value
-                // below the min_lsn. We should never do that.
-                Ok(LsnForTimestamp::Past(min_lsn))
-            }
            (true, false) => {
                // Only found commits with timestamps smaller than the request.
                // It's still a valid case for branch creation, return it.
@@ -671,7 +674,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    async fn list_aux_files_v1(
+    pub(crate) async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -689,63 +692,6 @@ impl Timeline {
        }
    }

-    async fn list_aux_files_v2(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        let kv = self
-            .scan(KeySpace::single(Key::metadata_aux_key_range()), lsn, ctx)
-            .await
-            .context("scan")?;
-        let mut result = HashMap::new();
-        for (_, v) in kv {
-            let v = v.context("get value")?;
-            let v = aux_file::decode_file_value_bytes(&v).context("value decode")?;
-            for (fname, content) in v {
-                result.insert(fname, content);
-            }
-        }
-        Ok(result)
-    }
-
-    pub(crate) async fn list_aux_files(
-        &self,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<HashMap<String, Bytes>, PageReconstructError> {
-        match self.get_switch_aux_file_policy() {
-            AuxFilePolicy::V1 => self.list_aux_files_v1(lsn, ctx).await,
-            AuxFilePolicy::V2 => self.list_aux_files_v2(lsn, ctx).await,
-            AuxFilePolicy::CrossValidation => {
-                let v1_result = self.list_aux_files_v1(lsn, ctx).await;
-                let v2_result = self.list_aux_files_v2(lsn, ctx).await;
-                match (v1_result, v2_result) {
-                    (Ok(v1), Ok(v2)) => {
-                        if v1 != v2 {
-                            tracing::error!(
-                                "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}"
-                            );
-                            return Err(PageReconstructError::Other(anyhow::anyhow!(
-                                "unmatched aux file v1 v2 result"
-                            )));
-                        }
-                        Ok(v1)
-                    }
-                    (Ok(_), Err(v2)) => {
-                        tracing::error!("aux file v1 returns Ok while aux file v2 returns an err");
-                        Err(v2)
-                    }
-                    (Err(v1), Ok(_)) => {
-                        tracing::error!("aux file v2 returns Ok while aux file v1 returns an err");
-                        Err(v1)
-                    }
-                    (Err(_), Err(v2)) => Err(v2),
-                }
-            }
-        }
-    }
-
    /// Does the same as get_current_logical_size but counted on demand.
    /// Used to initialize the logical size tracking on startup.
    ///
@@ -789,13 +735,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    ///
-    /// The return value is (dense keyspace, sparse keyspace).
    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -867,18 +811,13 @@ impl Timeline {
        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
            result.add_key(AUX_FILES_KEY);
        }
-
-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace::single(Key::metadata_aux_key_range())),
-        ))
+        Ok(result.to_keyspace())
    }

    /// Get cached size of relation if it not updated after specified LSN
    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
            if lsn >= *cached_lsn {
                return Some(*nblocks);
            }
@@ -889,16 +828,7 @@ impl Timeline {
    /// Update cached relation size if there is no more recent update
    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-
-        if lsn < rel_size_cache.complete_as_of {
-            // Do not cache old values. It's safe to cache the size on read, as long as
-            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
-            // never evict values from the cache, so if the relation size changed after
-            // 'lsn', the new value is already in the cache.
-            return;
-        }
-
-        match rel_size_cache.map.entry(tag) {
+        match rel_size_cache.entry(tag) {
            hash_map::Entry::Occupied(mut entry) => {
                let cached_lsn = entry.get_mut();
                if lsn >= cached_lsn.0 {
@@ -914,13 +844,13 @@ impl Timeline {
    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.map.insert(tag, (lsn, nblocks));
+        rel_size_cache.insert(tag, (lsn, nblocks));
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.map.remove(tag);
+        rel_size_cache.remove(tag);
    }
 }

@@ -1158,7 +1088,7 @@ impl<'a> DatadirModification<'a> {
    ) -> anyhow::Result<()> {
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
+            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1201,22 +1131,21 @@ impl<'a> DatadirModification<'a> {
        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
            .context("deserialize db")?;
        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let mut rel_dir =
-            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
-                // Didn't exist. Update dbdir
-                e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
-                self.pending_directory_entries
-                    .push((DirectoryKind::Db, dbdir.dbdirs.len()));
-                self.put(DBDIR_KEY, Value::Image(buf.into()));
+        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
+            // Didn't exist. Update dbdir
+            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
+            let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+            self.pending_directory_entries
+                .push((DirectoryKind::Db, dbdir.dbdirs.len()));
+            self.put(DBDIR_KEY, Value::Image(buf.into()));

-                // and create the RelDirectory
-                RelDirectory::default()
-            } else {
-                // reldir already exists, fetch it
-                RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                    .context("deserialize db")?
-            };
+            // and create the RelDirectory
+            RelDirectory::default()
+        } else {
+            // reldir already exists, fetch it
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
+                .context("deserialize db")?
+        };

        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
@@ -1258,7 +1187,7 @@ impl<'a> DatadirModification<'a> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
        if self
            .tline
-            .get_rel_exists(rel, Version::Modified(self), ctx)
+            .get_rel_exists(rel, Version::Modified(self), true, ctx)
            .await?
        {
            let size_key = rel_size_to_key(rel);
@@ -1447,9 +1376,6 @@ impl<'a> DatadirModification<'a> {
    }

    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
-        if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() {
-            return Ok(());
-        }
        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
            files: HashMap::new(),
        })?;
@@ -1465,122 +1391,86 @@ impl<'a> DatadirModification<'a> {
        content: &[u8],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let policy = self.tline.get_switch_aux_file_policy();
-        if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy {
-            let key = aux_file::encode_aux_file_key(path);
-            // retrieve the key from the engine
-            let old_val = match self.get(key, ctx).await {
-                Ok(val) => Some(val),
-                Err(PageReconstructError::MissingKey(_)) => None,
-                Err(e) => return Err(e.into()),
-            };
-            let files = if let Some(ref old_val) = old_val {
-                aux_file::decode_file_value(old_val)?
-            } else {
-                Vec::new()
-            };
-            let new_files = if content.is_empty() {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .collect::<Vec<_>>()
-            } else {
-                files
-                    .into_iter()
-                    .filter(|(p, _)| &path != p)
-                    .chain(std::iter::once((path, content)))
-                    .collect::<Vec<_>>()
-            };
-            let new_val = aux_file::encode_file_value(&new_files)?;
-            self.put(key, Value::Image(new_val.into()));
-        }
+        let file_path = path.to_string();
+        let content = if content.is_empty() {
+            None
+        } else {
+            Some(Bytes::copy_from_slice(content))
+        };

-        if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy {
-            let file_path = path.to_string();
-            let content = if content.is_empty() {
-                None
+        let n_files;
+        let mut aux_files = self.tline.aux_files.lock().await;
+        if let Some(mut dir) = aux_files.dir.take() {
+            // We already updated aux files in `self`: emit a delta and update our latest value
+            dir.upsert(file_path.clone(), content.clone());
+            n_files = dir.files.len();
+            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::Image(Bytes::from(
+                        AuxFilesDirectory::ser(&dir).context("serialize")?,
+                    )),
+                );
+                aux_files.n_deltas = 0;
            } else {
-                Some(Bytes::copy_from_slice(content))
-            };
+                self.put(
+                    AUX_FILES_KEY,
+                    Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
+                );
+                aux_files.n_deltas += 1;
+            }
+            aux_files.dir = Some(dir);
+        } else {
+            // Check if the AUX_FILES_KEY is initialized
+            match self.get(AUX_FILES_KEY, ctx).await {
+                Ok(dir_bytes) => {
+                    let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
+                    // Key is already set, we may append a delta
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::WalRecord(NeonWalRecord::AuxFile {
+                            file_path: file_path.clone(),
+                            content: content.clone(),
+                        }),
+                    );
+                    dir.upsert(file_path, content);
+                    n_files = dir.files.len();
+                    aux_files.dir = Some(dir);
+                }
+                Err(
+                    e @ (PageReconstructError::AncestorStopping(_)
+                    | PageReconstructError::Cancelled
+                    | PageReconstructError::AncestorLsnTimeout(_)),
+                ) => {
+                    // Important that we do not interpret a shutdown error as "not found" and thereby
+                    // reset the map.
+                    return Err(e.into());
+                }
+                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
+                // we are assuming that all _other_ possible errors represent a missing key.  If some
+                // other error occurs, we may incorrectly reset the map of aux files.
+                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                    // Key is missing, we must insert an image as the basis for subsequent deltas.

-            let n_files;
-            let mut aux_files = self.tline.aux_files.lock().await;
-            if let Some(mut dir) = aux_files.dir.take() {
-                // We already updated aux files in `self`: emit a delta and update our latest value.
-                dir.upsert(file_path.clone(), content.clone());
-                n_files = dir.files.len();
-                if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
+                    let mut dir = AuxFilesDirectory {
+                        files: HashMap::new(),
+                    };
+                    dir.upsert(file_path, content);
                    self.put(
                        AUX_FILES_KEY,
                        Value::Image(Bytes::from(
                            AuxFilesDirectory::ser(&dir).context("serialize")?,
                        )),
                    );
-                    aux_files.n_deltas = 0;
-                } else {
-                    self.put(
-                        AUX_FILES_KEY,
-                        Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }),
-                    );
-                    aux_files.n_deltas += 1;
-                }
-                aux_files.dir = Some(dir);
-            } else {
-                // Check if the AUX_FILES_KEY is initialized
-                match self.get(AUX_FILES_KEY, ctx).await {
-                    Ok(dir_bytes) => {
-                        let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
-                        // Key is already set, we may append a delta
-                        self.put(
-                            AUX_FILES_KEY,
-                            Value::WalRecord(NeonWalRecord::AuxFile {
-                                file_path: file_path.clone(),
-                                content: content.clone(),
-                            }),
-                        );
-                        dir.upsert(file_path, content);
-                        n_files = dir.files.len();
-                        aux_files.dir = Some(dir);
-                    }
-                    Err(
-                        e @ (PageReconstructError::AncestorStopping(_)
-                        | PageReconstructError::Cancelled
-                        | PageReconstructError::AncestorLsnTimeout(_)),
-                    ) => {
-                        // Important that we do not interpret a shutdown error as "not found" and thereby
-                        // reset the map.
-                        return Err(e.into());
-                    }
-                    // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
-                    // the original code assumes all other errors are missing keys. Therefore, we keep the code path
-                    // the same for now, though in theory, we should only match the `MissingKey` variant.
-                    Err(
-                        PageReconstructError::Other(_)
-                        | PageReconstructError::WalRedo(_)
-                        | PageReconstructError::MissingKey { .. },
-                    ) => {
-                        // Key is missing, we must insert an image as the basis for subsequent deltas.
-
-                        let mut dir = AuxFilesDirectory {
-                            files: HashMap::new(),
-                        };
-                        dir.upsert(file_path, content);
-                        self.put(
-                            AUX_FILES_KEY,
-                            Value::Image(Bytes::from(
-                                AuxFilesDirectory::ser(&dir).context("serialize")?,
-                            )),
-                        );
-                        n_files = 1;
-                        aux_files.dir = Some(dir);
-                    }
+                    n_files = 1;
+                    aux_files.dir = Some(dir);
                }
            }
-
-            self.pending_directory_entries
-                .push((DirectoryKind::AuxFiles, n_files));
        }

+        self.pending_directory_entries
+            .push((DirectoryKind::AuxFiles, n_files));
+
        Ok(())
    }

@@ -1651,8 +1541,6 @@ impl<'a> DatadirModification<'a> {
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let mut writer = self.tline.writer().await;

-        let timer = WAL_INGEST.time_spent_on_ingest.start_timer();
-
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

@@ -1692,8 +1580,6 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

-        timer.observe_duration();
-
        Ok(())
    }

--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -33,6 +33,7 @@ impl Value {
    }
 }

+#[cfg(test)]
 #[derive(Debug, PartialEq)]
 pub(crate) enum InvalidInput {
    TooShortValue,
@@ -41,8 +42,10 @@ pub(crate) enum InvalidInput {

 /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
 /// use this type for querying if a slice looks some particular way.
+#[cfg(test)]
 pub(crate) struct ValueBytes;

+#[cfg(test)]
 impl ValueBytes {
    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
        if raw.len() < 12 {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -319,9 +319,6 @@ pub enum TaskKind {
    // Eviction. One per timeline.
    Eviction,

-    // Ingest housekeeping (flushing ephemeral layers on time threshold or disk pressure)
-    IngestHousekeeping,
-
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

@@ -364,14 +361,8 @@ pub enum TaskKind {

    DebugTool,

-    EphemeralFilePreWarmPageCache,
-
-    LayerDownload,
-
    #[cfg(test)]
    UnitTest,
-
-    DetachAncestor,
 }

 #[derive(Default)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -64,7 +64,6 @@ use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
 use self::timeline::WaitLsnError;
-use self::timeline::{GcCutoffs, GcInfo};
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -87,6 +86,7 @@ use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::InitializationOrder;
+use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
@@ -322,9 +322,6 @@ pub struct Tenant {
    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) timeline_get_throttle:
        Arc<throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>>,
-
-    /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline.
-    ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
 }

 impl std::fmt::Debug for Tenant {
@@ -562,10 +559,9 @@ impl Tenant {
            // By doing what we do here, the index part upload is retried.
            // If control plane retries timeline creation in the meantime, the mgmt API handler
            // for timeline creation will coalesce on the upload we queue here.
-            // FIXME: this branch should be dead code as we no longer write local metadata.
            let rtc = timeline.remote_client.as_ref().unwrap();
            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
+            rtc.schedule_index_upload_for_metadata_update(&metadata)?;
        }

        timeline
@@ -891,7 +887,7 @@ impl Tenant {

    #[instrument(skip_all)]
    pub(crate) async fn preload(
-        self: &Arc<Self>,
+        self: &Arc<Tenant>,
        remote_storage: &GenericRemoteStorage,
        cancel: CancellationToken,
    ) -> anyhow::Result<TenantPreload> {
@@ -921,13 +917,9 @@ impl Tenant {

        Ok(TenantPreload {
            deleting,
-            timelines: Self::load_timeline_metadata(
-                self,
-                remote_timeline_ids,
-                remote_storage,
-                cancel,
-            )
-            .await?,
+            timelines: self
+                .load_timeline_metadata(remote_timeline_ids, remote_storage, cancel)
+                .await?,
        })
    }

@@ -1679,34 +1671,6 @@ impl Tenant {
        Ok(())
    }

-    // Call through to all timelines to freeze ephemeral layers if needed.  Usually
-    // this happens during ingest: this background housekeeping is for freezing layers
-    // that are open but haven't been written to for some time.
-    async fn ingest_housekeeping(&self) {
-        // Scan through the hashmap and collect a list of all the timelines,
-        // while holding the lock. Then drop the lock and actually perform the
-        // compactions.  We don't want to block everything else while the
-        // compaction runs.
-        let timelines = {
-            self.timelines
-                .lock()
-                .unwrap()
-                .values()
-                .filter_map(|timeline| {
-                    if timeline.is_active() {
-                        Some(timeline.clone())
-                    } else {
-                        None
-                    }
-                })
-                .collect::<Vec<_>>()
-        };
-
-        for timeline in &timelines {
-            timeline.maybe_freeze_ephemeral_layer().await;
-        }
-    }
-
    pub fn current_state(&self) -> TenantState {
        self.state.borrow().clone()
    }
@@ -2560,7 +2524,6 @@ impl Tenant {
                &crate::metrics::tenant_throttling::TIMELINE_GET,
            )),
            tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
-            ongoing_timeline_detach: std::sync::Mutex::default(),
        }
    }

@@ -2844,48 +2807,7 @@ impl Tenant {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<Vec<Arc<Timeline>>> {
-        // before taking the gc_cs lock, do the heavier weight finding of gc_cutoff points for
-        // currently visible timelines.
-        let timelines = self
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tl| match target_timeline_id.as_ref() {
-                Some(target) => &tl.timeline_id == target,
-                None => true,
-            })
-            .cloned()
-            .collect::<Vec<_>>();
-
-        let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
-            HashMap::with_capacity(timelines.len());
-
-        for timeline in timelines.iter() {
-            let cutoff = timeline
-                .get_last_record_lsn()
-                .checked_sub(horizon)
-                .unwrap_or(Lsn(0));
-
-            let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
-
-            match res {
-                Ok(cutoffs) => {
-                    let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
-                    assert!(old.is_none());
-                }
-                Err(e) => {
-                    tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
-                }
-            }
-        }
-
-        if !self.is_active() {
-            anyhow::bail!("shutting down");
-        }
-
-        // grab mutex to prevent new timelines from being created here; avoid doing long operations
-        // because that will stall branch creation.
+        // grab mutex to prevent new timelines from being created here.
        let gc_cs = self.gc_cs.lock().await;

        // Scan all timelines. For each timeline, remember the timeline ID and
@@ -2947,36 +2869,20 @@ impl Tenant {
                }
            }

-            let branchpoints: Vec<Lsn> = all_branchpoints
-                .range((
-                    Included((timeline_id, Lsn(0))),
-                    Included((timeline_id, Lsn(u64::MAX))),
-                ))
-                .map(|&x| x.1)
-                .collect();
+            if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
+                let branchpoints: Vec<Lsn> = all_branchpoints
+                    .range((
+                        Included((timeline_id, Lsn(0))),
+                        Included((timeline_id, Lsn(u64::MAX))),
+                    ))
+                    .map(|&x| x.1)
+                    .collect();
+                timeline
+                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
+                    .await?;

-            {
-                let mut target = timeline.gc_info.write().unwrap();
-
-                match gc_cutoffs.remove(&timeline_id) {
-                    Some(cutoffs) => {
-                        *target = GcInfo {
-                            retain_lsns: branchpoints,
-                            cutoffs,
-                        };
-                    }
-                    None => {
-                        // reasons for this being unavailable:
-                        // - this timeline was created while we were finding cutoffs
-                        // - lsn for timestamp search fails for this timeline repeatedly
-                        //
-                        // in both cases, refreshing the branchpoints is correct.
-                        target.retain_lsns = branchpoints;
-                    }
-                };
+                gc_timelines.push(timeline);
            }
-
-            gc_timelines.push(timeline);
        }
        drop(gc_cs);
        Ok(gc_timelines)
@@ -3063,7 +2969,7 @@ impl Tenant {
        // and then the planned GC cutoff
        {
            let gc_info = src_timeline.gc_info.read().unwrap();
-            let cutoff = gc_info.min_cutoff();
+            let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
            if start_lsn < cutoff {
                return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
                    "invalid branch start lsn: less than planned GC cutoff {cutoff}"
@@ -3121,7 +3027,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_full_metadata_update(&metadata)
+                .schedule_index_upload_for_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3492,11 +3398,7 @@ impl Tenant {
        // is in progress (which is not a common case).
        //
        // See more for on the issue #2748 condenced out of the initial PR review.
-        let mut shared_cache = tokio::select! {
-            locked = self.cached_logical_sizes.lock() => locked,
-            _ = cancel.cancelled() => anyhow::bail!("cancelled"),
-            _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
-        };
+        let mut shared_cache = self.cached_logical_sizes.lock().await;

        size::gather_inputs(
            self,
@@ -3758,7 +3660,6 @@ pub(crate) mod harness {
                image_layer_creation_check_threshold: Some(
                    tenant_conf.image_layer_creation_check_threshold,
                ),
-                switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
            }
        }
    }
@@ -3957,11 +3858,8 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
-    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
-    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
-    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};

    static TEST_KEY: Lazy<Key> =
@@ -4599,25 +4497,11 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        tenant: &Tenant,
-        timeline: &Arc<Timeline>,
-        ctx: &RequestContext,
-        lsn: Lsn,
-        repeat: usize,
-        key_count: usize,
-    ) -> anyhow::Result<()> {
-        let compact = true;
-        bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
-    }
-
-    async fn bulk_insert_maybe_compact_gc(
-        tenant: &Tenant,
-        timeline: &Arc<Timeline>,
+        timeline: Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
        repeat: usize,
        key_count: usize,
-        compact: bool,
    ) -> anyhow::Result<()> {
        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let mut blknum = 0;
@@ -4625,8 +4509,6 @@ mod tests {
        // Enforce that key range is monotonously increasing
        let mut keyspace = KeySpaceAccum::new();

-        let cancel = CancellationToken::new();
-
        for _ in 0..repeat {
            for _ in 0..key_count {
                test_key.field6 = blknum;
@@ -4648,19 +4530,22 @@ mod tests {
                blknum += 1;
            }

-            timeline.freeze_and_flush().await?;
-            if compact {
-                // this requires timeline to be &Arc<Timeline>
-                timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
-            }
+            let cutoff = timeline.get_last_record_lsn();

-            // this doesn't really need to use the timeline_id target, but it is closer to what it
-            // originally was.
-            let res = tenant
-                .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
+            timeline
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    ctx,
+                )
                .await?;
-
-            assert_eq!(res.layers_removed, 0, "this never removes anything");
+            timeline.freeze_and_flush().await?;
+            timeline
+                .compact(&CancellationToken::new(), EnumSet::empty(), ctx)
+                .await?;
+            timeline.gc().await?;
        }

        Ok(())
@@ -4679,7 +4564,7 @@ mod tests {
            .await?;

        let lsn = Lsn(0x10);
-        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

        Ok(())
    }
@@ -4710,7 +4595,7 @@ mod tests {
            .await?;

        let lsn = Lsn(0x10);
-        bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

        let guard = tline.layers.read().await;
        guard.layer_map().dump(true, &ctx).await?;
@@ -4763,9 +4648,7 @@ mod tests {
        for read in reads {
            info!("Doing vectored read on {:?}", read);

-            let vectored_res = tline
-                .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
-                .await;
+            let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
            tline
                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
                .await;
@@ -4774,59 +4657,6 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_get_vectored_aux_files")?;
-
-        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-        let tline = tline.raw_timeline().unwrap();
-
-        let mut modification = tline.begin_modification(Lsn(0x1000));
-        modification.put_file("foo/bar1", b"content1", &ctx).await?;
-        modification.set_lsn(Lsn(0x1008))?;
-        modification.put_file("foo/bar2", b"content2", &ctx).await?;
-        modification.commit(&ctx).await?;
-
-        let child_timeline_id = TimelineId::generate();
-        tenant
-            .branch_timeline_test(
-                tline,
-                child_timeline_id,
-                Some(tline.get_last_record_lsn()),
-                &ctx,
-            )
-            .await?;
-
-        let child_timeline = tenant
-            .get_timeline(child_timeline_id, true)
-            .expect("Should have the branched timeline");
-
-        let aux_keyspace = KeySpace {
-            ranges: vec![NON_INHERITED_RANGE],
-        };
-        let read_lsn = child_timeline.get_last_record_lsn();
-
-        let vectored_res = child_timeline
-            .get_vectored_impl(
-                aux_keyspace.clone(),
-                read_lsn,
-                ValuesReconstructState::new(),
-                &ctx,
-            )
-            .await;
-
-        child_timeline
-            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
-            .await;
-
-        let images = vectored_res?;
-        assert!(images.is_empty());
-        Ok(())
-    }
-
    // Test that vectored get handles layer gaps correctly
    // by advancing into the next ancestor timeline if required.
    //
@@ -4955,12 +4785,7 @@ mod tests {
            ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
        };
        let results = child_timeline
-            .get_vectored_impl(
-                read.clone(),
-                current_lsn,
-                ValuesReconstructState::new(),
-                &ctx,
-            )
+            .get_vectored_impl(read.clone(), current_lsn, &ctx)
            .await?;

        for (key, img_res) in results {
@@ -5093,7 +4918,6 @@ mod tests {
                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                    },
                    query_lsn,
-                    ValuesReconstructState::new(),
                    &ctx,
                )
                .await;
@@ -5134,29 +4958,13 @@ mod tests {

    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_random_updates_legacy", CompactionAlgorithm::Legacy),
-            ("test_random_updates_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_random_updates_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_random_updates_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_random_updates")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
            .await?;

        const NUM_KEYS: usize = 1000;
-        let cancel = CancellationToken::new();

        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();

@@ -5215,11 +5023,22 @@ mod tests {
                );
            }

-            // Perform a cycle of flush, and GC
-            tline.freeze_and_flush().await?;
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+            // Perform a cycle of flush, compact, and GC
+            let cutoff = tline.get_last_record_lsn();
+            tline
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    &ctx,
+                )
                .await?;
+            tline.freeze_and_flush().await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
+            tline.gc().await?;
        }

        Ok(())
@@ -5240,8 +5059,6 @@ mod tests {

        let mut keyspace = KeySpaceAccum::new();

-        let cancel = CancellationToken::new();
-
        // Track when each page was last modified. Used to assert that
        // a read sees the latest page version.
        let mut updated = [Lsn(0); NUM_KEYS];
@@ -5305,11 +5122,21 @@ mod tests {
            }

            // Perform a cycle of flush, compact, and GC
-            tline.freeze_and_flush().await?;
-            tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+            let cutoff = tline.get_last_record_lsn();
+            tline
+                .update_gc_info(
+                    Vec::new(),
+                    cutoff,
+                    Duration::ZERO,
+                    &CancellationToken::new(),
+                    &ctx,
+                )
                .await?;
+            tline.freeze_and_flush().await?;
+            tline
+                .compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
+                .await?;
+            tline.gc().await?;
        }

        Ok(())
@@ -5491,140 +5318,19 @@ mod tests {

    #[tokio::test]
    async fn test_read_at_max_lsn() -> anyhow::Result<()> {
-        let names_algorithms = [
-            ("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
-            ("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
-        ];
-        for (name, algorithm) in names_algorithms {
-            test_read_at_max_lsn_algorithm(name, algorithm).await?;
-        }
-        Ok(())
-    }
-
-    async fn test_read_at_max_lsn_algorithm(
-        name: &'static str,
-        compaction_algorithm: CompactionAlgorithm,
-    ) -> anyhow::Result<()> {
-        let mut harness = TenantHarness::create(name)?;
-        harness.tenant_conf.compaction_algorithm = compaction_algorithm;
+        let harness = TenantHarness::create("test_read_at_max_lsn")?;
        let (tenant, ctx) = harness.load().await;
        let tline = tenant
            .create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
            .await?;

        let lsn = Lsn(0x10);
-        let compact = false;
-        bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?;
+        bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;

        let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
        let read_lsn = Lsn(u64::MAX - 1);

-        let result = tline.get(test_key, read_lsn, &ctx).await;
-        assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_metadata_scan() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_scan")?;
-        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
-            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-
-        const NUM_KEYS: usize = 1000;
-        const STEP: usize = 100; // random update + scan base_key + idx * STEP
-
-        let cancel = CancellationToken::new();
-
-        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        base_key.field1 = AUX_KEY_PREFIX;
-        let mut test_key = base_key;
-
-        // Track when each page was last modified. Used to assert that
-        // a read sees the latest page version.
-        let mut updated = [Lsn(0); NUM_KEYS];
-
-        let mut lsn = Lsn(0x10);
-        #[allow(clippy::needless_range_loop)]
-        for blknum in 0..NUM_KEYS {
-            lsn = Lsn(lsn.0 + 0x10);
-            test_key.field6 = (blknum * STEP) as u32;
-            let mut writer = tline.writer().await;
-            writer
-                .put(
-                    test_key,
-                    lsn,
-                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
-                    &ctx,
-                )
-                .await?;
-            writer.finish_write(lsn);
-            updated[blknum] = lsn;
-            drop(writer);
-        }
-
-        let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
-
-        for _ in 0..10 {
-            // Read all the blocks
-            for (blknum, last_lsn) in updated.iter().enumerate() {
-                test_key.field6 = (blknum * STEP) as u32;
-                assert_eq!(
-                    tline.get(test_key, lsn, &ctx).await?,
-                    test_img(&format!("{} at {}", blknum, last_lsn))
-                );
-            }
-
-            let mut cnt = 0;
-            for (key, value) in tline
-                .get_vectored_impl(
-                    keyspace.clone(),
-                    lsn,
-                    ValuesReconstructState::default(),
-                    &ctx,
-                )
-                .await?
-            {
-                let blknum = key.field6 as usize;
-                let value = value?;
-                assert!(blknum % STEP == 0);
-                let blknum = blknum / STEP;
-                assert_eq!(
-                    value,
-                    test_img(&format!("{} at {}", blknum, updated[blknum]))
-                );
-                cnt += 1;
-            }
-
-            assert_eq!(cnt, NUM_KEYS);
-
-            for _ in 0..NUM_KEYS {
-                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
-                test_key.field6 = (blknum * STEP) as u32;
-                let mut writer = tline.writer().await;
-                writer
-                    .put(
-                        test_key,
-                        lsn,
-                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
-                        &ctx,
-                    )
-                    .await?;
-                writer.finish_write(lsn);
-                drop(writer);
-                updated[blknum] = lsn;
-            }
-
-            // Perform a cycle of flush, compact, and GC
-            tline.freeze_and_flush().await?;
-            tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
-                .await?;
-        }
+        assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());

        Ok(())
    }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        self.offset
    }

-    const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
+    const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };

    /// Writes the given buffer directly to the underlying `VirtualFile`.
    /// You need to make sure that the internal buffer is empty, otherwise
@@ -130,9 +130,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    async fn write_all_unbuffered<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
        src_buf: B,
-        ctx: &RequestContext,
    ) -> (B::Buf, Result<(), Error>) {
-        let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
+        let (src_buf, res) = self.inner.write_all(src_buf).await;
        let nbytes = match res {
            Ok(nbytes) => nbytes,
            Err(e) => return (src_buf, Err(e)),
@@ -143,9 +142,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {

    #[inline(always)]
    /// Flushes the internal buffer to the underlying `VirtualFile`.
-    pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
+    pub async fn flush_buffer(&mut self) -> Result<(), Error> {
        let buf = std::mem::take(&mut self.buf);
-        let (mut buf, res) = self.inner.write_all(buf, ctx).await;
+        let (mut buf, res) = self.inner.write_all(buf).await;
        res?;
        buf.clear();
        self.buf = buf;
@@ -166,11 +165,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
        src_buf: B,
-        ctx: &RequestContext,
    ) -> (B::Buf, Result<(), Error>) {
        if !BUFFERED {
            assert!(self.buf.is_empty());
-            return self.write_all_unbuffered(src_buf, ctx).await;
+            return self.write_all_unbuffered(src_buf).await;
        }
        let remaining = Self::CAPACITY - self.buf.len();
        let src_buf_len = src_buf.bytes_init();
@@ -185,7 +183,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        }
        // Then, if the buffer is full, flush it out
        if self.buf.len() == Self::CAPACITY {
-            if let Err(e) = self.flush_buffer(ctx).await {
+            if let Err(e) = self.flush_buffer().await {
                return (Slice::into_inner(src_buf), Err(e));
            }
        }
@@ -201,7 +199,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                assert_eq!(copied, src_buf.len());
                Slice::into_inner(src_buf)
            } else {
-                let (src_buf, res) = self.write_all_unbuffered(src_buf, ctx).await;
+                let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
                if let Err(e) = res {
                    return (src_buf, Err(e));
                }
@@ -218,7 +216,6 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    pub async fn write_blob<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
        &mut self,
        srcbuf: B,
-        ctx: &RequestContext,
    ) -> (B::Buf, Result<u64, Error>) {
        let offset = self.offset;

@@ -230,7 +227,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            if len < 128 {
                // Short blob. Write a 1-byte length header
                io_buf.put_u8(len as u8);
-                self.write_all(io_buf, ctx).await
+                self.write_all(io_buf).await
            } else {
                // Write a 4-byte length header
                if len > 0x7fff_ffff {
@@ -245,7 +242,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                let mut len_buf = (len as u32).to_be_bytes();
                len_buf[0] |= 0x80;
                io_buf.extend_from_slice(&len_buf[..]);
-                self.write_all(io_buf, ctx).await
+                self.write_all(io_buf).await
            }
        }
        .await;
@@ -254,7 +251,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            Ok(_) => (),
            Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
        }
-        let (srcbuf, res) = self.write_all(srcbuf, ctx).await;
+        let (srcbuf, res) = self.write_all(srcbuf).await;
        (srcbuf, res.map(|_| offset))
    }
 }
@@ -264,8 +261,8 @@ impl BlobWriter<true> {
    ///
    /// This function flushes the internal buffer before giving access
    /// to the underlying `VirtualFile`.
-    pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
-        self.flush_buffer(ctx).await?;
+    pub async fn into_inner(mut self) -> Result<VirtualFile, Error> {
+        self.flush_buffer().await?;
        Ok(self.inner)
    }

@@ -302,16 +299,16 @@ mod tests {
            let file = VirtualFile::create(pathbuf.as_path()).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
-                let (_, res) = wtr.write_blob(blob.clone(), &ctx).await;
+                let (_, res) = wtr.write_blob(blob.clone()).await;
                let offs = res?;
                offsets.push(offs);
            }
            // Write out one page worth of zeros so that we can
            // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
+            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(&ctx).await?;
+            wtr.flush_buffer().await?;
        }

        let file = VirtualFile::open(pathbuf.as_path()).await?;
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,7 +9,6 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
-use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::EvictionPolicy;
 use pageserver_api::models::{self, ThrottleConfig};
@@ -370,10 +369,6 @@ pub struct TenantConf {
    // How much WAL must be ingested before checking again whether a new image layer is required.
    // Expresed in multiples of checkpoint distance.
    pub image_layer_creation_check_threshold: u8,
-
-    /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
-    /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
-    pub switch_aux_file_policy: AuxFilePolicy,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -469,10 +464,6 @@ pub struct TenantConfOpt {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,
-
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(default)]
-    pub switch_aux_file_policy: Option<AuxFilePolicy>,
 }

 impl TenantConfOpt {
@@ -530,9 +521,6 @@ impl TenantConfOpt {
            image_layer_creation_check_threshold: self
                .image_layer_creation_check_threshold
                .unwrap_or(global_conf.image_layer_creation_check_threshold),
-            switch_aux_file_policy: self
-                .switch_aux_file_policy
-                .unwrap_or(global_conf.switch_aux_file_policy),
        }
    }
 }
@@ -574,7 +562,6 @@ impl Default for TenantConf {
            lazy_slru_download: false,
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
-            switch_aux_file_policy: AuxFilePolicy::V1,
        }
    }
 }
@@ -649,7 +636,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
            lazy_slru_download: value.lazy_slru_download,
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
-            switch_aux_file_policy: value.switch_aux_file_policy,
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -585,20 +585,9 @@ impl DeleteTenantFlow {

                    // FIXME: we should not be modifying this from outside of mgr.rs.
                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-
-                    // Update stats
-                    match &removed {
-                        TenantsMapRemoveResult::Occupied(slot) => {
-                            crate::metrics::TENANT_MANAGER.slot_removed(slot);
-                        }
-                        TenantsMapRemoveResult::InProgress(barrier) => {
-                            crate::metrics::TENANT_MANAGER
-                                .slot_removed(&TenantSlot::InProgress(barrier.clone()));
-                        }
-                        TenantsMapRemoveResult::Vacant => {
-                            // Nothing changed in map, no metric update
-                        }
-                    }
+                    crate::metrics::TENANT_MANAGER
+                        .tenant_slots
+                        .set(locked.len() as u64);

                    match removed {
                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,26 +3,36 @@

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache;
+use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::{self, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
+use std::cmp::min;

-use std::io;
+use std::io::{self, ErrorKind};
+use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
+use tracing::*;
 use utils::id::TimelineId;

 pub struct EphemeralFile {
+    page_cache_file_id: page_cache::FileId,
+
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
-
-    rw: page_caching::RW,
+    file: VirtualFile,
+    len: u64,
+    /// An ephemeral file is append-only.
+    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
+    /// The other pages, which can no longer be modified, are accessed through the page cache.
+    ///
+    /// None <=> IO is ongoing.
+    /// Size is fixed to PAGE_SZ at creation time and must not be changed.
+    mutable_tail: Option<BytesMut>,
 }

-mod page_caching;
-mod zero_padded_read_write;
-
 impl EphemeralFile {
    pub async fn create(
        conf: &PageServerConf,
@@ -49,18 +59,21 @@ impl EphemeralFile {
        .await?;

        Ok(EphemeralFile {
+            page_cache_file_id: page_cache::next_file_id(),
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file),
+            file,
+            len: 0,
+            mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
        })
    }

    pub(crate) fn len(&self) -> u64 {
-        self.rw.bytes_written()
+        self.len
    }

-    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.rw.page_cache_file_id()
+    pub(crate) fn id(&self) -> page_cache::FileId {
+        self.page_cache_file_id
    }

    pub(crate) async fn read_blk(
@@ -68,7 +81,44 @@ impl EphemeralFile {
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, io::Error> {
-        self.rw.read_blk(blknum, ctx).await
+        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        // order path before error because error is anyhow::Error => might have many contexts
+                        format!(
+                            "ephemeral file: read immutable page #{}: {}: {:#}",
+                            blknum, self.file.path, e,
+                        ),
+                    )
+                })? {
+                page_cache::ReadBufResult::Found(guard) => {
+                    return Ok(BlockLease::PageReadGuard(guard))
+                }
+                page_cache::ReadBufResult::NotFound(write_guard) => {
+                    let write_guard = self
+                        .file
+                        .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
+                        .await?;
+                    let read_guard = write_guard.mark_valid();
+                    return Ok(BlockLease::PageReadGuard(read_guard));
+                }
+            };
+        } else {
+            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(
+                self.mutable_tail
+                    .as_deref()
+                    .expect("we're not doing IO, it must be Some()")
+                    .try_into()
+                    .expect("we ensure that it's always PAGE_SZ"),
+            ))
+        }
    }

    pub(crate) async fn write_blob(
@@ -76,22 +126,137 @@ impl EphemeralFile {
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                Ok(Writer {
+                    blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            async fn push_bytes(
+                &mut self,
+                src: &[u8],
+                ctx: &RequestContext,
+            ) -> Result<(), io::Error> {
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let dst_remaining = &mut self
+                        .ephemeral_file
+                        .mutable_tail
+                        .as_deref_mut()
+                        .expect("IO is not yet ongoing")[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
+                            .expect("IO is not yet ongoing");
+                        let (mutable_tail, res) = self
+                            .ephemeral_file
+                            .file
+                            .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
+                            .await;
+                        // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
+                        // I.e., the IO isn't retryable if we panic.
+                        self.ephemeral_file.mutable_tail = Some(mutable_tail);
+                        match res {
+                            Ok(_) => {
+                                // Pre-warm the page cache with what we just wrote.
+                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
+                                let cache = page_cache::get();
+                                match cache
+                                    .read_immutable_buf(
+                                        self.ephemeral_file.page_cache_file_id,
+                                        self.blknum,
+                                        ctx,
+                                    )
+                                    .await
+                                {
+                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
+                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
+                                    }
+                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
+                                        let buf: &mut [u8] = write_guard.deref_mut();
+                                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                                        buf.copy_from_slice(
+                                            self.ephemeral_file
+                                                .mutable_tail
+                                                .as_deref()
+                                                .expect("IO is not ongoing"),
+                                        );
+                                        let _ = write_guard.mark_valid();
+                                        // pre-warm successful
+                                    }
+                                    Err(e) => {
+                                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                                    }
+                                }
+                                // Zero the buffer for re-use.
+                                // Zeroing is critical for correctness because the write_blob code below
+                                // and similarly read_blk expect zeroed pages.
+                                self.ephemeral_file
+                                    .mutable_tail
+                                    .as_deref_mut()
+                                    .expect("IO is not ongoing")
+                                    .fill(0);
+                                // This block is done, move to next one.
+                                self.blknum += 1;
+                                self.off = 0;
+                            }
+                            Err(e) => {
+                                return Err(std::io::Error::new(
+                                    ErrorKind::Other,
+                                    // order error before path because path is long and error is short
+                                    format!(
+                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
+                                        self.blknum,
+                                        e,
+                                        self.ephemeral_file.file.path,
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
+        let pos = self.len;
+        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
            // short one-byte length header
            let len_buf = [srcbuf.len() as u8];
-
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
+            writer.push_bytes(&len_buf, ctx).await?;
        }

        // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
+        writer.push_bytes(srcbuf, ctx).await?;
+
+        if srcbuf.len() < 0x80 {
+            self.len += 1;
+        } else {
+            self.len += 4;
+        }
+        self.len += srcbuf.len() as u64;

        Ok(pos)
    }
@@ -106,6 +271,28 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // There might still be pages in the [`crate::page_cache`] for this file.
+        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+
+        // unlink the file
+        let res = std::fs::remove_file(&self.file.path);
+        if let Err(e) = res {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.file.path, e
+                );
+            }
+        }
+    }
+}
+
 impl BlockReader for EphemeralFile {
    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,223 +0,0 @@
-//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
-//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-
-use crate::context::RequestContext;
-use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::VirtualFile;
-
-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use tokio_epoll_uring::BoundedBuf;
-use tracing::*;
-
-use super::zero_padded_read_write;
-
-/// See module-level comment.
-pub struct RW {
-    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
-}
-
-impl RW {
-    pub fn new(file: VirtualFile) -> Self {
-        let page_cache_file_id = page_cache::next_file_id();
-        Self {
-            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-            )),
-        }
-    }
-
-    pub fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
-    }
-
-    pub(crate) async fn write_all_borrowed(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<usize, io::Error> {
-        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
-        // because Compute is unlikely to access recently written data.
-        self.rw.write_all_borrowed(srcbuf, ctx).await
-    }
-
-    pub(crate) fn bytes_written(&self) -> u64 {
-        self.rw.bytes_written()
-    }
-
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        match self.rw.read_blk(blknum).await? {
-            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
-                let cache = page_cache::get();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.rw.as_writer().file.path,
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(write_guard) => {
-                        let write_guard = writer
-                            .file
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
-                            .await?;
-                        let read_guard = write_guard.mark_valid();
-                        return Ok(BlockLease::PageReadGuard(read_guard));
-                    }
-                }
-            }
-            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
-                Ok(BlockLease::EphemeralFileMutableTail(buffer))
-            }
-        }
-    }
-}
-
-impl Drop for RW {
-    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
-
-        // unlink the file
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
-            }
-        }
-    }
-}
-
-struct PreWarmingWriter {
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self {
-        Self {
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<
-        B: tokio_epoll_uring::BoundedBuf<Buf = Buf>,
-        Buf: tokio_epoll_uring::IoBuf + Send,
-    >(
-        &mut self,
-        buf: B,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, B::Buf)> {
-        let buf = buf.slice(..);
-        let saved_bounds = buf.bounds(); // save for reconstructing the Slice from iobuf after the IO is done
-        let check_bounds_stuff_works = if cfg!(test) && cfg!(debug_assertions) {
-            Some(buf.to_vec())
-        } else {
-            None
-        };
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let iobuf = match self.file.write_all(buf, ctx).await {
-            (iobuf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                iobuf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        // Reconstruct the Slice (the write path consumed the Slice and returned us the underlying IoBuf)
-        let buf = tokio_epoll_uring::Slice::from_buf_bounds(iobuf, saved_bounds);
-        if let Some(check_bounds_stuff_works) = check_bounds_stuff_works {
-            assert_eq!(&check_bounds_stuff_works, &*buf);
-        }
-
-        // Pre-warm page cache with the contents.
-        // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-        // benefits the code that writes InMemoryLayer=>L0 layers.
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-        let cache = page_cache::get();
-        static CTX: Lazy<RequestContext> = Lazy::new(|| {
-            RequestContext::new(
-                crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                crate::context::DownloadBehavior::Error,
-            )
-        });
-        for blknum_in_buffer in 0..nblocks {
-            let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-            let blknum = self
-                .nwritten_blocks
-                .checked_add(blknum_in_buffer as u32)
-                .unwrap();
-            match cache
-                .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                .await
-            {
-                Err(e) => {
-                    error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                    // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                }
-                Ok(v) => match v {
-                    page_cache::ReadBufResult::Found(_guard) => {
-                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                        unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        write_guard.copy_from_slice(blk_in_buffer);
-                        let _ = write_guard.mark_valid();
-                    }
-                },
-            }
-        }
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf.into_inner()))
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -1,130 +0,0 @@
-//! The heart of how [`super::EphemeralFile`] does its reads and writes.
-//!
-//! # Writes
-//!
-//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
-//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
-//!
-//! # Reads
-//!
-//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
-//!
-//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
-//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
-//! if the read is for the prefix that has already been flushed.
-//!
-//! # Current Usage
-//!
-//! The current user of this module is [`super::page_caching::RW`].
-
-mod zero_padded;
-
-use crate::{
-    context::RequestContext,
-    page_cache::PAGE_SZ,
-    virtual_file::owned_buffers_io::{
-        self,
-        write::{Buffer, OwnedAsyncWriter},
-    },
-};
-
-const TAIL_SZ: usize = 64 * 1024;
-
-/// See module-level comment.
-pub struct RW<W: OwnedAsyncWriter> {
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        zero_padded::Buffer<TAIL_SZ>,
-        owned_buffers_io::util::size_tracking_writer::Writer<W>,
-    >,
-}
-
-pub enum ReadResult<'a, W> {
-    NeedsReadFromWriter { writer: &'a W },
-    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
-}
-
-impl<W> RW<W>
-where
-    W: OwnedAsyncWriter,
-{
-    pub fn new(writer: W) -> Self {
-        let bytes_flushed_tracker =
-            owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
-        let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
-            bytes_flushed_tracker,
-            zero_padded::Buffer::default(),
-        );
-        Self { buffered_writer }
-    }
-
-    pub(crate) fn as_writer(&self) -> &W {
-        self.buffered_writer.as_inner().as_inner()
-    }
-
-    pub async fn write_all_borrowed(
-        &mut self,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> std::io::Result<usize> {
-        self.buffered_writer.write_buffered_borrowed(buf, ctx).await
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        flushed_offset + u64::try_from(buffer.pending()).unwrap()
-    }
-
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
-        let read_offset = (blknum as u64) * (PAGE_SZ as u64);
-
-        // The trailing page ("block") might only be partially filled,
-        // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
-        // Moreover, it has to be zero-padded, because when we still had
-        // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
-        // DeltaLayer probably has the same issue, not sure why it needs no special treatment.
-        // => check here that the read doesn't go beyond this potentially trailing
-        // => the zero-padding is done in the `else` branch below
-        let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
-            buffered_offset / (PAGE_SZ as u64)
-        } else {
-            (buffered_offset / (PAGE_SZ as u64)) + 1
-        };
-        if (blknum as u64) >= blocks_written {
-            return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
-        }
-
-        // assertions for the `if-else` below
-        assert_eq!(
-            flushed_offset % (TAIL_SZ as u64), 0,
-            "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
-        );
-        assert_eq!(
-            flushed_offset % (PAGE_SZ as u64),
-            0,
-            "the logic below can't handle if the page is spread across the flushed part and the buffer"
-        );
-
-        if read_offset < flushed_offset {
-            assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
-            Ok(ReadResult::NeedsReadFromWriter {
-                writer: self.as_writer(),
-            })
-        } else {
-            let read_offset_in_buffer = read_offset
-                .checked_sub(flushed_offset)
-                .expect("would have taken `if` branch instead of this one");
-            let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
-            let zero_padded_slice = buffer.as_zero_padded_slice();
-            let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
-            Ok(ReadResult::ServedFromZeroPaddedMutableTail {
-                buffer: page
-                    .try_into()
-                    .expect("the slice above got it as page-size slice"),
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
@@ -1,108 +0,0 @@
-//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
-//! unwritten range is guaranteed to be zero-initialized.
-//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
-//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
-
-use std::mem::MaybeUninit;
-
-/// See module-level comment.
-pub struct Buffer<const N: usize> {
-    allocation: Box<[u8; N]>,
-    written: usize,
-}
-
-impl<const N: usize> Default for Buffer<N> {
-    fn default() -> Self {
-        Self {
-            allocation: Box::new(
-                // SAFETY: zeroed memory is a valid [u8; N]
-                unsafe { MaybeUninit::zeroed().assume_init() },
-            ),
-            written: 0,
-        }
-    }
-}
-
-impl<const N: usize> Buffer<N> {
-    #[inline(always)]
-    fn invariants(&self) {
-        // don't check by default, unoptimized is too expensive even for debug mode
-        if false {
-            debug_assert!(self.written <= N, "{}", self.written);
-            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
-        }
-    }
-
-    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
-        &self.allocation
-    }
-}
-
-impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
-    type IoBuf = Self;
-
-    fn cap(&self) -> usize {
-        self.allocation.len()
-    }
-
-    fn extend_from_slice(&mut self, other: &[u8]) {
-        self.invariants();
-        let remaining = self.allocation.len() - self.written;
-        if other.len() > remaining {
-            panic!("calling extend_from_slice() with insufficient remaining capacity");
-        }
-        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
-        self.written += other.len();
-        self.invariants();
-    }
-
-    fn pending(&self) -> usize {
-        self.written
-    }
-
-    fn flush(self) -> tokio_epoll_uring::Slice<Self> {
-        self.invariants();
-        let written = self.written;
-        tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
-    }
-
-    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
-        let Self {
-            mut allocation,
-            written,
-        } = iobuf;
-        allocation[0..written].fill(0);
-        let new = Self {
-            allocation,
-            written: 0,
-        };
-        new.invariants();
-        new
-    }
-}
-
-/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
-/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
-///
-/// Remember that bytes_init is generally _not_ a tracker of the amount
-/// of valid data in the io buffer; we use `Slice` for that.
-/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
-///
-/// SAFETY:
-///
-/// The [`Self::allocation`] is stable becauses boxes are stable.
-/// The memory is zero-initialized, so, bytes_init is always N.
-unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
-    fn stable_ptr(&self) -> *const u8 {
-        self.allocation.as_ptr()
-    }
-
-    fn bytes_init(&self) -> usize {
-        // Yes, N, not self.written; Read the full comment of this impl block!
-        N
-    }
-
-    fn bytes_total(&self) -> usize {
-        N
-    }
-}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -588,7 +588,7 @@ impl LayerMap {
            let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
            coverage.push((kr, current_val.take()));
            current_key = change_key;
-            current_val.clone_from(&change_val);
+            current_val = change_val.clone();
        }

        // Add the final interval
@@ -672,12 +672,12 @@ impl LayerMap {
        // Loop through the delta coverage and recurse on each part
        for (change_key, change_val) in version.delta_coverage.range(start..end) {
            // If there's a relevant delta in this part, add 1 and recurse down
-            if let Some(val) = &current_val {
+            if let Some(val) = current_val {
                if val.get_lsn_range().end > lsn.start {
                    let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                    let lr = lsn.start..val.get_lsn_range().start;
                    if !kr.is_empty() {
-                        let base_count = Self::is_reimage_worthy(val, key) as usize;
+                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
                        let new_limit = limit.map(|l| l - base_count);
                        let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                        max_stacked_deltas = std::cmp::max(
@@ -689,17 +689,17 @@ impl LayerMap {
            }

            current_key = change_key;
-            current_val.clone_from(&change_val);
+            current_val = change_val.clone();
        }

        // Consider the last part
-        if let Some(val) = &current_val {
+        if let Some(val) = current_val {
            if val.get_lsn_range().end > lsn.start {
                let kr = Key::from_i128(current_key)..Key::from_i128(end);
                let lr = lsn.start..val.get_lsn_range().start;

                if !kr.is_empty() {
-                    let base_count = Self::is_reimage_worthy(val, key) as usize;
+                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
                    let new_limit = limit.map(|l| l - base_count);
                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                    max_stacked_deltas = std::cmp::max(
@@ -916,7 +916,6 @@ mod tests {
        assert_eq!(lhs, rhs);
    }

-    #[cfg(test)]
    fn brute_force_range_search(
        layer_map: &LayerMap,
        key_range: Range<Key>,
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -207,24 +207,6 @@ impl TimelineMetadata {
        self.body.ancestor_lsn
    }

-    /// When reparenting, the `ancestor_lsn` does not change.
-    pub fn reparent(&mut self, timeline: &TimelineId) {
-        assert!(self.body.ancestor_timeline.is_some());
-        // no assertion for redoing this: it's fine, we may have to repeat this multiple times over
-        self.body.ancestor_timeline = Some(*timeline);
-    }
-
-    pub fn detach_from_ancestor(&mut self, timeline: &TimelineId, ancestor_lsn: &Lsn) {
-        if let Some(ancestor) = self.body.ancestor_timeline {
-            assert_eq!(ancestor, *timeline);
-        }
-        if self.body.ancestor_lsn != Lsn(0) {
-            assert_eq!(self.body.ancestor_lsn, *ancestor_lsn);
-        }
-        self.body.ancestor_timeline = None;
-        self.body.ancestor_lsn = Lsn(0);
-    }
-
    pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
        self.body.latest_gc_cutoff_lsn
    }
@@ -253,12 +235,6 @@ impl TimelineMetadata {
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
    }
-
-    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
-        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
-        self.body.prev_record_lsn = update.prev_record_lsn;
-        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
-    }
 }

 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -283,27 +259,6 @@ impl Serialize for TimelineMetadata {
    }
 }

-/// Parts of the metadata which are regularly modified.
-pub(crate) struct MetadataUpdate {
-    disk_consistent_lsn: Lsn,
-    prev_record_lsn: Option<Lsn>,
-    latest_gc_cutoff_lsn: Lsn,
-}
-
-impl MetadataUpdate {
-    pub(crate) fn new(
-        disk_consistent_lsn: Lsn,
-        prev_record_lsn: Option<Lsn>,
-        latest_gc_cutoff_lsn: Lsn,
-    ) -> Self {
-        Self {
-            disk_consistent_lsn,
-            prev_record_lsn,
-            latest_gc_cutoff_lsn,
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,6 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::LocationConfigMode;
@@ -56,7 +55,6 @@ use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
 use super::secondary::SecondaryTenant;
-use super::timeline::detach_ancestor::PreparedTimelineDetach;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -247,7 +245,6 @@ impl TenantsMap {
        }
    }

-    #[cfg(all(debug_assertions, not(test)))]
    pub(crate) fn len(&self) -> usize {
        match self {
            TenantsMap::Initializing => 0,
@@ -256,15 +253,17 @@ impl TenantsMap {
    }
 }

-/// Precursor to deletion of a tenant dir: we do a fast rename to a tmp path, and then
-/// the slower actual deletion in the background.
-///
 /// This is "safe" in that that it won't leave behind a partially deleted directory
 /// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
 /// the contents.
 ///
 /// This is pageserver-specific, as it relies on future processes after a crash to check
 /// for TEMP_FILE_SUFFIX when loading things.
+async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
+    // Rename to the temporary path first; only the renamed directory is then deleted.
+    fs::remove_dir_all(safe_rename_tenant_dir(path).await?).await
+}
+
 async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
    let parent = path
        .as_ref()
@@ -287,28 +286,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
    Ok(tmp_path)
 }

-/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
-/// the background, and thereby avoid blocking any API requests on this deletion completing.
-fn spawn_background_purge(tmp_path: Utf8PathBuf) {
-    // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
-    // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
-    let task_tenant_id = None;
-
-    task_mgr::spawn(
-        task_mgr::BACKGROUND_RUNTIME.handle(),
-        TaskKind::MgmtRequest,
-        task_tenant_id,
-        None,
-        "tenant_files_delete",
-        false,
-        async move {
-            fs::remove_dir_all(tmp_path.as_path())
-                .await
-                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
-        },
-    );
-}
-
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

@@ -593,11 +570,7 @@ pub async fn init_tenant_mgr(
    );
    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

-    // Accumulate futures for writing tenant configs, so that we can execute in parallel
-    let mut config_write_futs = Vec::new();
-
-    // Update the location configs according to the re-attach response and persist them to disk
-    tracing::info!("Updating {} location configs", tenant_configs.len());
+    // Construct `Tenant` objects and start them running
    for (tenant_shard_id, location_conf) in tenant_configs {
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

@@ -624,22 +597,18 @@ pub async fn init_tenant_mgr(
        const DEFAULT_SECONDARY_CONF: SecondaryLocationConfig =
            SecondaryLocationConfig { warm: true };

+        // Update the location config according to the re-attach response
        if let Some(tenant_modes) = &tenant_modes {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            match tenant_modes.get(&tenant_shard_id) {
                None => {
                    info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
-
-                    match safe_rename_tenant_dir(&tenant_dir_path).await {
-                        Ok(tmp_path) => {
-                            spawn_background_purge(tmp_path);
-                        }
-                        Err(e) => {
-                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                            "Failed to move detached tenant directory '{tenant_dir_path}': {e:?}");
-                        }
-                    };
+                    if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
+                        error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
+                        );
+                    }

                    // We deleted local content: move on to next tenant, don't try and spawn this one.
                    continue;
@@ -685,32 +654,8 @@ pub async fn init_tenant_mgr(

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
-        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
-            (tenant_shard_id, location_conf, r)
-        });
-    }
+        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    // Execute config writes with concurrency, to avoid bottlenecking on local FS write latency
-    tracing::info!(
-        "Writing {} location config files...",
-        config_write_futs.len()
-    );
-    let config_write_results = futures::stream::iter(config_write_futs)
-        .buffer_unordered(16)
-        .collect::<Vec<_>>()
-        .await;
-
-    tracing::info!(
-        "Spawning {} tenant shard locations...",
-        config_write_results.len()
-    );
-    // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running
-    for (tenant_shard_id, location_conf, config_write_result) in config_write_results {
-        // Errors writing configs are fatal
-        config_write_result?;
-
-        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
            LocationMode::Attached(attached_conf) => {
@@ -748,7 +693,6 @@ pub async fn init_tenant_mgr(
            }
        };

-        METRICS.slot_inserted(&slot);
        tenants.insert(tenant_shard_id, slot);
    }

@@ -756,7 +700,7 @@ pub async fn init_tenant_mgr(

    let mut tenants_map = TENANTS.write().unwrap();
    assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
-
+    METRICS.tenant_slots.set(tenants.len() as u64);
    *tenants_map = TenantsMap::Open(tenants);

    Ok(TenantManager {
@@ -827,14 +771,6 @@ fn tenant_spawn(
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    let mut join_set = JoinSet::new();

-    #[cfg(all(debug_assertions, not(test)))]
-    {
-        // Check that our metrics properly tracked the size of the tenants map.  This is a convenient location to check,
-        // as it happens implicitly at the end of tests etc.
-        let m = tenants.read().unwrap();
-        debug_assert_eq!(METRICS.slots_total(), m.len() as u64);
-    }
-
    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
    let (total_in_progress, total_attached) = {
        let mut m = tenants.write().unwrap();
@@ -1763,7 +1699,7 @@ impl TenantManager {
        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
            "failpoint"
@@ -1918,6 +1854,28 @@ impl TenantManager {
        shutdown_all_tenants0(self.tenants).await
    }

+    /// Lazily deletes a tenant directory that was already moved to a temporary path, using a
+    /// background task so that API requests are not blocked on the deletion completing.
+    fn spawn_background_purge(&self, tmp_path: Utf8PathBuf) {
+        let purge = async move {
+            fs::remove_dir_all(tmp_path.as_path())
+                .await
+                .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+        };
+
+        // The tenant id passed below is `None`: this task is not meant to be bound by the lifetime
+        // of the tenant in memory; a detached tenant has no task_mgr tasks for its tenant_id.
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            None,
+            None,
+            "tenant_files_delete",
+            false,
+            purge,
+        );
+    }
+
    pub(crate) async fn detach_tenant(
        &self,
        conf: &'static PageServerConf,
@@ -1934,7 +1892,7 @@ impl TenantManager {
                deletion_queue_client,
            )
            .await?;
-        spawn_background_purge(tmp_path);
+        self.spawn_background_purge(tmp_path);

        Ok(())
    }
@@ -2008,101 +1966,6 @@ impl TenantManager {
            })
            .collect())
    }
-
-    /// Completes an earlier prepared timeline detach ancestor.
-    pub(crate) async fn complete_detaching_timeline_ancestor(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        prepared: PreparedTimelineDetach,
-        ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
-        struct RevertOnDropSlot(Option<SlotGuard>);
-
-        impl Drop for RevertOnDropSlot {
-            fn drop(&mut self) {
-                if let Some(taken) = self.0.take() {
-                    taken.revert();
-                }
-            }
-        }
-
-        impl RevertOnDropSlot {
-            fn into_inner(mut self) -> SlotGuard {
-                self.0.take().unwrap()
-            }
-        }
-
-        impl std::ops::Deref for RevertOnDropSlot {
-            type Target = SlotGuard;
-
-            fn deref(&self) -> &Self::Target {
-                self.0.as_ref().unwrap()
-            }
-        }
-
-        let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let slot_guard = RevertOnDropSlot(Some(slot_guard));
-
-        let tenant = {
-            let Some(old_slot) = slot_guard.get_old_value() else {
-                anyhow::bail!(
-                    "Tenant not found when trying to complete detaching timeline ancestor"
-                );
-            };
-
-            let Some(tenant) = old_slot.get_attached() else {
-                anyhow::bail!("Tenant is not in attached state");
-            };
-
-            if !tenant.is_active() {
-                anyhow::bail!("Tenant is not active");
-            }
-
-            tenant.clone()
-        };
-
-        let timeline = tenant.get_timeline(timeline_id, true)?;
-
-        let reparented = timeline
-            .complete_detaching_timeline_ancestor(&tenant, prepared, ctx)
-            .await?;
-
-        let mut slot_guard = slot_guard.into_inner();
-
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                // this really should not happen, at all, unless shutdown was already going?
-                anyhow::bail!("Cannot restart Tenant, already shutting down");
-            }
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            self.tenants,
-            SpawnMode::Eager,
-            ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(reparented)
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -2534,13 +2397,10 @@ impl SlotGuard {
                TenantsMap::Open(m) => m,
            };

-            METRICS.slot_inserted(&new_value);
-
            let replaced = m.insert(self.tenant_shard_id, new_value);
            self.upserted = true;
-            if let Some(replaced) = replaced.as_ref() {
-                METRICS.slot_removed(replaced);
-            }
+
+            METRICS.tenant_slots.set(m.len() as u64);

            replaced
        };
@@ -2650,13 +2510,9 @@ impl Drop for SlotGuard {
                }

                if self.old_value_is_shutdown() {
-                    METRICS.slot_removed(entry.get());
                    entry.remove();
                } else {
-                    let inserting = self.old_value.take().unwrap();
-                    METRICS.slot_inserted(&inserting);
-                    let replaced = entry.insert(inserting);
-                    METRICS.slot_removed(&replaced);
+                    entry.insert(self.old_value.take().unwrap());
                }
            }
            Entry::Vacant(_) => {
@@ -2667,6 +2523,8 @@ impl Drop for SlotGuard {
                );
            }
        }
+
+        METRICS.tenant_slots.set(m.len() as u64);
    }
 }

@@ -2746,9 +2604,7 @@ fn tenant_map_acquire_slot_impl(
            }
            _ => {
                let (completion, barrier) = utils::completion::channel();
-                let inserting = TenantSlot::InProgress(barrier);
-                METRICS.slot_inserted(&inserting);
-                v.insert(inserting);
+                v.insert(TenantSlot::InProgress(barrier));
                tracing::debug!("Vacant, inserted InProgress");
                Ok(SlotGuard::new(*tenant_shard_id, None, completion))
            }
@@ -2784,10 +2640,7 @@ fn tenant_map_acquire_slot_impl(
                _ => {
                    // Happy case: the slot was not in any state that violated our mode
                    let (completion, barrier) = utils::completion::channel();
-                    let in_progress = TenantSlot::InProgress(barrier);
-                    METRICS.slot_inserted(&in_progress);
-                    let old_value = o.insert(in_progress);
-                    METRICS.slot_removed(&old_value);
+                    let old_value = o.insert(TenantSlot::InProgress(barrier));
                    tracing::debug!("Occupied, replaced with InProgress");
                    Ok(SlotGuard::new(
                        *tenant_shard_id,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -202,15 +202,12 @@ use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;

-use remote_storage::{
-    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
-};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

-use crate::context::RequestContext;
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
@@ -239,14 +236,11 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

-use super::metadata::MetadataUpdate;
-use super::storage_layer::{Layer, LayerName, ResidentLayer};
+use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;

-pub(crate) use download::{
-    download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines,
-};
+pub(crate) use download::{is_temp_download_file, list_remote_timelines};
 pub(crate) use index::LayerFileMetadata;

 // Occasional network issues and such can cause remote operations to fail, and
@@ -475,7 +469,7 @@ impl RemoteTimelineClient {
            },
        );

-        let (index_part, _index_generation) = download::download_index_part(
+        let index_part = download::download_index_part(
            &self.storage_impl,
            &self.tenant_shard_id,
            &self.timeline_id,
@@ -503,10 +497,9 @@ impl RemoteTimelineClient {
    /// On success, returns the size of the downloaded file.
    pub async fn download_layer_file(
        &self,
-        layer_file_name: &LayerName,
+        layer_file_name: &LayerFileName,
        layer_metadata: &LayerFileMetadata,
        cancel: &CancellationToken,
-        ctx: &RequestContext,
    ) -> anyhow::Result<u64> {
        let downloaded_size = {
            let _unfinished_gauge_guard = self.metrics.call_begin(
@@ -524,7 +517,6 @@ impl RemoteTimelineClient {
                layer_file_name,
                layer_metadata,
                cancel,
-                ctx,
            )
            .measure_remote_op(
                RemoteOpFileKind::Layer,
@@ -544,10 +536,9 @@ impl RemoteTimelineClient {
    // Upload operations.
    //

-    /// Launch an index-file upload operation in the background, with
-    /// fully updated metadata.
    ///
-    /// This should only be used to upload initial metadata to remote storage.
+    /// Launch an index-file upload operation in the background, with
+    /// updated metadata.
    ///
    /// The upload will be added to the queue immediately, but it
    /// won't be performed until all previously scheduled layer file
@@ -559,7 +550,7 @@ impl RemoteTimelineClient {
    /// If there were any changes to the list of files, i.e. if any
    /// layer file uploads were scheduled, since the last index file
    /// upload, those will be included too.
-    pub fn schedule_index_upload_for_full_metadata_update(
+    pub fn schedule_index_upload_for_metadata_update(
        self: &Arc<Self>,
        metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -570,28 +561,7 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue);
-
-        Ok(())
-    }
-
-    /// Launch an index-file upload operation in the background, with only parts of the metadata
-    /// updated.
-    ///
-    /// This is the regular way of updating metadata on layer flushes or Gc.
-    ///
-    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
-    /// `index_part.json`, while being more clear on what values update regularly.
-    pub(crate) fn schedule_index_upload_for_metadata_update(
-        self: &Arc<Self>,
-        update: &MetadataUpdate,
-    ) -> anyhow::Result<()> {
-        let mut guard = self.upload_queue.lock().unwrap();
-        let upload_queue = guard.initialized_mut()?;
-
-        upload_queue.latest_metadata.apply(update);
-
-        self.schedule_index_upload(upload_queue);
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());

        Ok(())
    }
@@ -611,14 +581,18 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
        }

        Ok(())
    }

    /// Launch an index-file upload operation in the background (internal function)
-    fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
+    fn schedule_index_upload(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        metadata: TimelineMetadata,
+    ) {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();

        info!(
@@ -627,7 +601,11 @@ impl RemoteTimelineClient {
            upload_queue.latest_files_changes_since_metadata_upload_scheduled,
        );

-        let index_part = IndexPart::from(&*upload_queue);
+        let index_part = IndexPart::new(
+            upload_queue.latest_files.clone(),
+            disk_consistent_lsn,
+            metadata,
+        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
@@ -637,61 +615,9 @@ impl RemoteTimelineClient {
        self.launch_queued_tasks(upload_queue);
    }

-    pub(crate) async fn schedule_reparenting_and_wait(
-        self: &Arc<Self>,
-        new_parent: &TimelineId,
-    ) -> anyhow::Result<()> {
-        // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing
-        // and reads the in-memory part we cannot do the detaching like this
-        let receiver = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            upload_queue.latest_metadata.reparent(new_parent);
-
-            self.schedule_index_upload(upload_queue);
-
-            self.schedule_barrier0(upload_queue)
-        };
-
-        Self::wait_completion0(receiver).await
-    }
-
-    /// Schedules uploading a new version of `index_part.json` with the given layers added,
-    /// detaching from ancestor and waits for it to complete.
    ///
-    /// This is used with `Timeline::detach_ancestor` functionality.
-    pub(crate) async fn schedule_adding_existing_layers_to_index_detach_and_wait(
-        self: &Arc<Self>,
-        layers: &[Layer],
-        adopted: (TimelineId, Lsn),
-    ) -> anyhow::Result<()> {
-        let barrier = {
-            let mut guard = self.upload_queue.lock().unwrap();
-            let upload_queue = guard.initialized_mut()?;
-
-            upload_queue
-                .latest_metadata
-                .detach_from_ancestor(&adopted.0, &adopted.1);
-
-            for layer in layers {
-                upload_queue
-                    .latest_files
-                    .insert(layer.layer_desc().layer_name(), layer.metadata());
-            }
-
-            self.schedule_index_upload(upload_queue);
-
-            let barrier = self.schedule_barrier0(upload_queue);
-            self.launch_queued_tasks(upload_queue);
-            barrier
-        };
-
-        Self::wait_completion0(barrier).await
-    }
-
-    /// Launch an upload operation in the background; the file is added to be included in next
-    /// `index_part.json` upload.
+    /// Launch an upload operation in the background.
+    ///
    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
        layer: ResidentLayer,
@@ -713,15 +639,13 @@ impl RemoteTimelineClient {

        upload_queue
            .latest_files
-            .insert(layer.layer_desc().layer_name(), metadata.clone());
+            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

        info!(
-            gen=?metadata.generation,
-            shard=?metadata.shard,
-            "scheduled layer file upload {layer}",
+            "scheduled layer file upload {layer} gen={:?} shard={:?}",
+            metadata.generation, metadata.shard
        );
-
        let op = UploadOp::UploadLayer(layer, metadata);
        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
@@ -737,7 +661,7 @@ impl RemoteTimelineClient {
    /// successfully.
    pub fn schedule_layer_file_deletion(
        self: &Arc<Self>,
-        names: &[LayerName],
+        names: &[LayerFileName],
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
@@ -765,7 +689,7 @@ impl RemoteTimelineClient {
        // the layer files as "dangling". this is fine, at worst case we create work for the
        // scrubber.

-        let names = gc_layers.iter().map(|x| x.layer_desc().layer_name());
+        let names = gc_layers.iter().map(|x| x.layer_desc().filename());

        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);

@@ -780,10 +704,14 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> Vec<(LayerName, LayerFileMetadata)>
+    ) -> Vec<(LayerFileName, LayerFileMetadata)>
    where
-        I: IntoIterator<Item = LayerName>,
+        I: IntoIterator<Item = LayerFileName>,
    {
+        // Deleting layers doesn't affect the values stored in TimelineMetadata,
+        // so we don't need to update it. Just serialize it.
+        let metadata = upload_queue.latest_metadata.clone();
+
        // Decorate our list of names with each name's metadata, dropping
        // names that are unexpectedly missing from our metadata.  This metadata
        // is later used when physically deleting layers, to construct key paths.
@@ -822,7 +750,7 @@ impl RemoteTimelineClient {
        // index_part update, because that needs to be uploaded before we can actually delete the
        // files.
        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue);
+            self.schedule_index_upload(upload_queue, metadata);
        }

        with_metadata
@@ -832,7 +760,7 @@ impl RemoteTimelineClient {
    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
    pub(crate) fn schedule_deletion_of_unlinked(
        self: &Arc<Self>,
-        layers: Vec<(LayerName, LayerFileMetadata)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
@@ -845,7 +773,7 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        mut with_metadata: Vec<(LayerName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
        // Filter out any layers which were not created by this tenant shard.  These are
        // layers that originate from some ancestor shard after a split, and may still
@@ -914,7 +842,7 @@ impl RemoteTimelineClient {
            self.schedule_layer_file_upload0(upload_queue, layer.clone());
        }

-        let names = compacted_from.iter().map(|x| x.layer_desc().layer_name());
+        let names = compacted_from.iter().map(|x| x.layer_desc().filename());

        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names);
        self.launch_queued_tasks(upload_queue);
@@ -924,18 +852,12 @@ impl RemoteTimelineClient {

    /// Wait for all previously scheduled uploads/deletions to complete
    pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let receiver = {
+        let mut receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
            self.schedule_barrier0(upload_queue)
        };

-        Self::wait_completion0(receiver).await
-    }
-
-    async fn wait_completion0(
-        mut receiver: tokio::sync::watch::Receiver<()>,
-    ) -> anyhow::Result<()> {
        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
        }
@@ -1051,7 +973,8 @@ impl RemoteTimelineClient {
            let deleted_at = Utc::now().naive_utc();
            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);

-            let mut index_part = IndexPart::from(&stopped.upload_queue_for_deletion);
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
+                .context("IndexPart serialize")?;
            index_part.deleted_at = Some(deleted_at);
            index_part
        };
@@ -1132,93 +1055,6 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    /// Uploads the given layer **without** adding it to be part of a future `index_part.json` upload.
-    ///
-    /// This is not normally needed.
-    pub(crate) async fn upload_layer_file(
-        self: &Arc<Self>,
-        uploaded: &ResidentLayer,
-        cancel: &CancellationToken,
-    ) -> anyhow::Result<()> {
-        let remote_path = remote_layer_path(
-            &self.tenant_shard_id.tenant_id,
-            &self.timeline_id,
-            self.tenant_shard_id.to_index(),
-            &uploaded.layer_desc().layer_name(),
-            uploaded.metadata().generation,
-        );
-
-        backoff::retry(
-            || async {
-                upload::upload_timeline_layer(
-                    &self.storage_impl,
-                    uploaded.local_path(),
-                    &remote_path,
-                    uploaded.metadata().file_size(),
-                    cancel,
-                )
-                .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "upload a layer without adding it to latest files",
-            cancel,
-        )
-        .await
-        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-        .and_then(|x| x)
-        .context("upload a layer without adding it to latest files")
-    }
-
-    /// Copies the `adopted` remote existing layer to the remote path of `adopted_as`. The layer is
-    /// not added to be part of a future `index_part.json` upload.
-    pub(crate) async fn copy_timeline_layer(
-        self: &Arc<Self>,
-        adopted: &Layer,
-        adopted_as: &Layer,
-        cancel: &CancellationToken,
-    ) -> anyhow::Result<()> {
-        let source_remote_path = remote_layer_path(
-            &self.tenant_shard_id.tenant_id,
-            &adopted
-                .get_timeline_id()
-                .expect("Source timeline should be alive"),
-            self.tenant_shard_id.to_index(),
-            &adopted.layer_desc().layer_name(),
-            adopted.metadata().generation,
-        );
-
-        let target_remote_path = remote_layer_path(
-            &self.tenant_shard_id.tenant_id,
-            &self.timeline_id,
-            self.tenant_shard_id.to_index(),
-            &adopted_as.layer_desc().layer_name(),
-            adopted_as.metadata().generation,
-        );
-
-        backoff::retry(
-            || async {
-                upload::copy_timeline_layer(
-                    &self.storage_impl,
-                    &source_remote_path,
-                    &target_remote_path,
-                    cancel,
-                )
-                .await
-            },
-            TimeoutOrCancel::caused_by_cancel,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "copy timeline layer",
-            cancel,
-        )
-        .await
-        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
-        .and_then(|x| x)
-        .context("remote copy timeline layer")
-    }
-
    async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> {
        match tokio::time::timeout(
            DELETION_QUEUE_FLUSH_TIMEOUT,
@@ -1286,7 +1122,7 @@ impl RemoteTimelineClient {
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

-        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a list_files below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
        self.flush_deletion_queue().await?;

@@ -1295,20 +1131,14 @@ impl RemoteTimelineClient {
        let remaining = download_retry(
            || async {
                self.storage_impl
-                    .list(
-                        Some(&timeline_storage_path),
-                        ListingMode::NoDelimiter,
-                        None,
-                        &cancel,
-                    )
+                    .list_files(Some(&timeline_storage_path), None, &cancel)
                    .await
            },
            "list remaining files",
            &cancel,
        )
        .await
-        .context("list files remaining files")?
-        .keys;
+        .context("list files remaining files")?;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -1390,7 +1220,7 @@ impl RemoteTimelineClient {
        while let Some(next_op) = upload_queue.queued_operations.front() {
            // Can we run this task now?
            let can_run_now = match next_op {
-                UploadOp::UploadLayer(..) => {
+                UploadOp::UploadLayer(_, _) => {
                    // Can always be scheduled.
                    true
                }
@@ -1517,25 +1347,13 @@ impl RemoteTimelineClient {

            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
-                    let local_path = layer.local_path();
-
-                    // We should only be uploading layers created by this `Tenant`'s lifetime, so
-                    // the metadata in the upload should always match our current generation.
-                    assert_eq!(layer_metadata.generation, self.generation);
-
-                    let remote_path = remote_layer_path(
-                        &self.tenant_shard_id.tenant_id,
-                        &self.timeline_id,
-                        layer_metadata.shard,
-                        &layer.layer_desc().layer_name(),
-                        layer_metadata.generation,
-                    );
-
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
+                        self.conf,
                        &self.storage_impl,
-                        local_path,
-                        &remote_path,
-                        layer_metadata.file_size(),
+                        path,
+                        layer_metadata,
+                        self.generation,
                        &self.cancel,
                    )
                    .measure_remote_op(
@@ -1867,11 +1685,6 @@ impl RemoteTimelineClient {
    }
 }

-pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    let path = format!("tenants/{tenant_shard_id}");
-    RemotePath::from_string(&path).expect("Failed to construct path")
-}
-
 pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")
@@ -1896,14 +1709,14 @@ pub fn remote_layer_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    shard: ShardIndex,
-    layer_file_name: &LayerName,
+    layer_file_name: &LayerFileName,
    generation: Generation,
 ) -> RemotePath {
    // Generation-aware key format
    let path = format!(
        "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
        shard.get_suffix(),
-        layer_file_name,
+        layer_file_name.file_name(),
        generation.get_suffix()
    );

@@ -1964,6 +1777,29 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    }
 }

+/// Files on the remote storage are stored under paths relative to the workdir.
+/// Such a path includes both the tenant and timeline ids, which makes the remote storage path unique.
+///
+/// Errors if the provided path does not start with the pageserver's workdir.
+pub fn remote_path(
+    conf: &PageServerConf,
+    local_path: &Utf8Path,
+    generation: Generation,
+) -> anyhow::Result<RemotePath> {
+    let stripped = local_path
+        .strip_prefix(&conf.workdir)
+        .context("Failed to strip workdir prefix")?;
+
+    let suffixed = format!("{0}{1}", stripped, generation.get_suffix());
+
+    RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
+        format!(
+            "to resolve remote part of path {:?} for base {:?}",
+            local_path, conf.workdir
+        )
+    })
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1971,7 +1807,6 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            storage_layer::layer::local_layer_path,
            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
@@ -2000,8 +1835,8 @@ mod tests {
        TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
    }

-    fn assert_file_list(a: &HashSet<LayerName>, b: &[&str]) {
-        let mut avec: Vec<String> = a.iter().map(|x| x.to_string()).collect();
+    fn assert_file_list(a: &HashSet<LayerFileName>, b: &[&str]) {
+        let mut avec: Vec<String> = a.iter().map(|x| x.file_name()).collect();
        avec.sort();

        let mut bvec = b.to_vec();
@@ -2127,7 +1962,7 @@ mod tests {
            .layer_metadata
            .keys()
            .map(|f| f.to_owned())
-            .collect::<HashSet<LayerName>>();
+            .collect::<HashSet<LayerFileName>>();
        let initial_layer = {
            assert!(initial_layers.len() == 1);
            initial_layers.into_iter().next().unwrap()
@@ -2153,21 +1988,12 @@ mod tests {
            ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59DA-00000000016B5A53".parse().unwrap(), dummy_contents("baz"))
        ]
        .into_iter()
-        .map(|(name, contents): (LayerName, Vec<u8>)| {
-
-            let local_path = local_layer_path(
-                harness.conf,
-                &timeline.tenant_shard_id,
-                &timeline.timeline_id,
-                &name,
-                &generation,
-            );
-            std::fs::write(&local_path, &contents).unwrap();
+        .map(|(name, contents): (LayerFileName, Vec<u8>)| {
+            std::fs::write(timeline_path.join(name.file_name()), &contents).unwrap();

            Layer::for_resident(
                harness.conf,
                &timeline,
-                local_path,
                name,
                LayerFileMetadata::new(contents.len() as u64, generation, shard),
            )
@@ -2198,7 +2024,7 @@ mod tests {
        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
        client
-            .schedule_index_upload_for_full_metadata_update(&metadata)
+            .schedule_index_upload_for_metadata_update(&metadata)
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -2234,9 +2060,9 @@ mod tests {
                .map(|f| f.to_owned())
                .collect(),
            &[
-                &initial_layer.to_string(),
-                &layers[0].layer_desc().layer_name().to_string(),
-                &layers[1].layer_desc().layer_name().to_string(),
+                &initial_layer.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
            ],
        );
        assert_eq!(index_part.metadata, metadata);
@@ -2250,7 +2076,7 @@ mod tests {
        // keep using schedule_layer_file_deletion because we don't have a way to wait for the
        // spawn_blocking started by the drop.
        client
-            .schedule_layer_file_deletion(&[layers[0].layer_desc().layer_name()])
+            .schedule_layer_file_deletion(&[layers[0].layer_desc().filename()])
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
@@ -2268,9 +2094,9 @@ mod tests {
        }
        assert_remote_files(
            &[
-                &initial_layer.to_string(),
-                &layers[0].layer_desc().layer_name().to_string(),
-                &layers[1].layer_desc().layer_name().to_string(),
+                &initial_layer.file_name(),
+                &layers[0].layer_desc().filename().file_name(),
+                &layers[1].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -2283,9 +2109,9 @@ mod tests {

        assert_remote_files(
            &[
-                &initial_layer.to_string(),
-                &layers[1].layer_desc().layer_name().to_string(),
-                &layers[2].layer_desc().layer_name().to_string(),
+                &initial_layer.file_name(),
+                &layers[1].layer_desc().filename().file_name(),
+                &layers[2].layer_desc().filename().file_name(),
                "index_part.json",
            ],
            &remote_timeline_dir,
@@ -2304,22 +2130,19 @@ mod tests {
            ..
        } = TestSetup::new("metrics").await.unwrap();
        let client = timeline.remote_client.as_ref().unwrap();
+        let timeline_path = harness.timeline_path(&TIMELINE_ID);

-        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let local_path = local_layer_path(
-            harness.conf,
-            &timeline.tenant_shard_id,
-            &timeline.timeline_id,
-            &layer_file_name_1,
-            &harness.generation,
-        );
+        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let content_1 = dummy_contents("foo");
-        std::fs::write(&local_path, &content_1).unwrap();
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )
+        .unwrap();

        let layer_file_1 = Layer::for_resident(
            harness.conf,
            &timeline,
-            local_path,
            layer_file_name_1.clone(),
            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
        );
@@ -2388,7 +2211,12 @@ mod tests {

    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
-        let example_index_part = IndexPart::example();
+        let example_metadata = TimelineMetadata::example();
+        let example_index_part = IndexPart::new(
+            HashMap::new(),
+            example_metadata.disk_consistent_lsn(),
+            example_metadata,
+        );

        let index_part_bytes = serde_json::to_vec(&example_index_part).unwrap();

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Conrad Ludgate	30148035c9	more task names	2024-04-19 16:04:34 +01:00
Conrad Ludgate	f0fa688ad7	docker	2024-04-19 15:31:23 +01:00
Conrad Ludgate	39345e3f57	add task names	2024-04-19 15:11:43 +01:00
Conrad Ludgate	4d1b5992eb	custom runtime threads	2024-04-19 15:02:52 +01:00
Conrad Ludgate	e8a5e0b0ed	add tokio-console	2024-04-19 14:54:22 +01:00
Conrad Ludgate	1a979cd27e	add more logs to metrics output	2024-04-19 14:11:45 +01:00
Conrad Ludgate	278ba8f8b5	proxy: simplify compute ssl setup	2024-04-19 13:55:51 +01:00