Merge remote-tracking branch 'origin/main' into problame/async-timeline-get/timeline-layers-tokio-sync-atop-4333

2026-01-14 17:02:56 +00:00 · 2023-06-07 15:34:04 +02:00
parent 2001c31a14 5761190e0d
commit a66a16ddf9
117 changed files with 6008 additions and 2316 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -36,14 +36,6 @@ inputs:
    description: 'Region name for real s3 tests'
    required: false
    default: ''
-  real_s3_access_key_id:
-    description: 'Access key id'
-    required: false
-    default: ''
-  real_s3_secret_access_key:
-    description: 'Secret access key'
-    required: false
-    default: ''
  rerun_flaky:
    description: 'Whether to rerun flaky tests'
    required: false
@@ -104,8 +96,6 @@ runs:
        COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
-        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
-        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -346,10 +346,8 @@ jobs:
          test_selection: regress
          needs_postgres_source: true
          run_with_real_s3: true
-          real_s3_bucket: ci-tests-s3
-          real_s3_region: us-west-2
-          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
-          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          real_s3_bucket: neon-github-ci-tests
+          real_s3_region: eu-central-1
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
@@ -409,9 +407,7 @@ jobs:
        uses: ./.github/actions/allure-report-generate

      - uses: actions/github-script@v6
-        if: >
-          !cancelled() &&
-          github.event_name == 'pull_request'
+        if: ${{ !cancelled() }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
@@ -421,7 +417,7 @@ jobs:
              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
            }

-            const script = require("./scripts/pr-comment-test-report.js")
+            const script = require("./scripts/comment-test-report.js")
            await script({
              github,
              context,
@@ -496,19 +492,24 @@ jobs:
        env:
          COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          scripts/coverage \
-            --dir=/tmp/coverage report \
+          scripts/coverage --dir=/tmp/coverage \
+            report \
            --input-objects=/tmp/coverage/binaries.list \
            --commit-url=${COMMIT_URL} \
            --format=github

+          scripts/coverage --dir=/tmp/coverage \
+            report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --format=lcov
+
      - name: Upload coverage report
        id: upload-coverage-report
        env:
          BUCKET: neon-github-public-dev
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
-          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA}
+          aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}

          REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
          echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
@@ -663,6 +664,9 @@ jobs:
          project: nrdv0s4kcs
          push: true
          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+          build-args: |
+            GIT_VERSION=${{ github.sha }}
+            REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
@@ -777,7 +781,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.7.3-alpha3
+      VM_BUILDER_VERSION: v0.8.0

    steps:
      - name: Checkout
@@ -798,7 +802,7 @@ jobs:

      - name: Build vm image
        run: |
-          ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ members = [
    "compute_tools",
    "control_plane",
    "pageserver",
+    "pageserver/ctl",
    "proxy",
    "safekeeper",
    "storage_broker",
@@ -22,7 +23,7 @@ async-stream = "0.3"
 async-trait = "0.1"
 atty = "0.2.14"
 aws-config = { version = "0.55", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.25"
+aws-sdk-s3 = "0.27"
 aws-smithy-http = "0.55"
 aws-credential-types = "0.55"
 aws-types = "0.55"
--- a/6
+++ b/6
@@ -47,8 +47,7 @@ RUN set -e \
    && mold -run cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
-      --bin pageserver_binutils  \
-      --bin draw_timeline_dir \
+      --bin pagectl  \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
@@ -73,8 +72,7 @@ RUN set -e \

 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router       /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir   /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -517,6 +517,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
    cargo pgx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

+#########################################################################################
+#
+# Layer "pg-pgx-ulid-build"
+# Compile "pgx_ulid" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-pgx-ulid-build
+
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
+    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -547,6 +563,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -556,6 +573,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/hnsw \
        -s install

 #########################################################################################
@@ -632,6 +653,7 @@ RUN apt update &&  \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
+        libcurl4-openssl-dev \
        procps && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/8
+++ b/8
@@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+	+@echo "Compiling hnsw $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -153,6 +158,9 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 * On Ubuntu or Debian, this set of packages should be sufficient to build the code:
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
-libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
+libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
+libcurl4-openssl-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel
+  protobuf-devel libcurl-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
 pacman -S base-devel readline zlib libseccomp openssl clang \
-postgresql-libs cmake postgresql protobuf
+postgresql-libs cmake postgresql protobuf curl
 ```

 Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -59,6 +59,9 @@ fn main() -> Result<()> {

    let matches = cli().get_matches();

+    let http_port = *matches
+        .get_one::<u16>("http-port")
+        .expect("http-port is required");
    let pgdata = matches
        .get_one::<String>("pgdata")
        .expect("PGDATA path is required");
@@ -178,7 +181,8 @@ fn main() -> Result<()> {

    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
-    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    let _http_handle =
+        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

    if !spec_set {
        // No spec provided, hang waiting for it.
@@ -286,6 +290,14 @@ fn cli() -> clap::Command {
    let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
    clap::Command::new("compute_ctl")
        .version(version)
+        .arg(
+            Arg::new("http-port")
+                .long("http-port")
+                .value_name("HTTP_PORT")
+                .default_value("3080")
+                .value_parser(clap::value_parser!(u16))
+                .required(false),
+        )
        .arg(
            Arg::new("connstr")
                .short('C')
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,19 +1,3 @@
-//
-// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
-// but there are several things that makes `PostgresNode` usage inconvenient in the
-// cloud:
-// - it inherits from `LocalEnv`, which contains **all-all** the information about
-//   a complete service running
-// - it uses `PageServerNode` with information about http endpoint, which we do not
-//   need in the cloud again
-// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
-//
-// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
-// attributes (not required for the cloud). Yet, it is still tempting to unify these
-// `PostgresNode` and `ComputeNode` and use one in both places.
-//
-// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
-//
 use std::fs;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
@@ -106,26 +90,38 @@ pub struct ParsedSpec {
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
+        // Extract the options from the spec file that are needed to connect to
+        // the storage system.
+        //
+        // For backwards-compatibility, the top-level fields in the spec file
+        // may be empty. In that case, we need to dig them from the GUCs in the
+        // cluster.settings field.
        let pageserver_connstr = spec
-            .cluster
-            .settings
-            .find("neon.pageserver_connstring")
+            .pageserver_connstring
+            .clone()
+            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
            .ok_or("pageserver connstr should be provided")?;
        let storage_auth_token = spec.storage_auth_token.clone();
-        let tenant_id: TenantId = spec
-            .cluster
-            .settings
-            .find("neon.tenant_id")
-            .ok_or("tenant id should be provided")
-            .map(|s| TenantId::from_str(&s))?
-            .or(Err("invalid tenant id"))?;
-        let timeline_id: TimelineId = spec
-            .cluster
-            .settings
-            .find("neon.timeline_id")
-            .ok_or("timeline id should be provided")
-            .map(|s| TimelineId::from_str(&s))?
-            .or(Err("invalid timeline id"))?;
+        let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
+            tenant_id
+        } else {
+            spec.cluster
+                .settings
+                .find("neon.tenant_id")
+                .ok_or("tenant id should be provided")
+                .map(|s| TenantId::from_str(&s))?
+                .or(Err("invalid tenant id"))?
+        };
+        let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
+            timeline_id
+        } else {
+            spec.cluster
+                .settings
+                .find("neon.timeline_id")
+                .ok_or("timeline id should be provided")
+                .map(|s| TimelineId::from_str(&s))?
+                .or(Err("invalid timeline id"))?
+        };

        Ok(ParsedSpec {
            spec,
@@ -295,8 +291,8 @@ impl ComputeNode {
        update_pg_hba(pgdata_path)?;

        match spec.mode {
-            ComputeMode::Primary | ComputeMode::Static(..) => {}
-            ComputeMode::Replica => {
+            ComputeMode::Primary => {}
+            ComputeMode::Replica | ComputeMode::Static(..) => {
                add_standby_signal(pgdata_path)?;
            }
        }
@@ -362,6 +358,8 @@ impl ComputeNode {
        };

        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
+        client.simple_query("SET neon.forward_ddl = false")?;
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
@@ -374,7 +372,7 @@ impl ComputeNode {

        info!(
            "finished configuration of compute for project {}",
-            spec.cluster.cluster_id
+            spec.cluster.cluster_id.as_deref().unwrap_or("None")
        );

        Ok(())
@@ -403,7 +401,9 @@ impl ComputeNode {
        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
+        // Disable DDL forwarding because control plane already knows about these roles/databases.
        if spec.mode == ComputeMode::Primary {
+            client.simple_query("SET neon.forward_ddl = false")?;
            handle_roles(&spec, &mut client)?;
            handle_databases(&spec, &mut client)?;
            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
@@ -430,7 +430,7 @@ impl ComputeNode {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            spec.spec.cluster.cluster_id,
+            spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
            spec.spec.operation_uuid.as_deref().unwrap_or("None"),
            spec.tenant_id,
            spec.timeline_id,
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -5,6 +5,7 @@ use std::path::Path;

 use anyhow::Result;

+use crate::pg_helpers::escape_conf_value;
 use crate::pg_helpers::PgOptionsSerialize;
 use compute_api::spec::{ComputeMode, ComputeSpec};

@@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

-    writeln!(file, "# Managed by compute_ctl: begin")?;
+    // Write the postgresql.conf content from the spec file as is.
+    if let Some(conf) = &spec.cluster.postgresql_conf {
+        writeln!(file, "{}", conf)?;
+    }

    write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;

+    // Add options for connecting to storage
+    writeln!(file, "# Neon storage settings")?;
+    if let Some(s) = &spec.pageserver_connstring {
+        writeln!(
+            file,
+            "neon.pageserver_connstring='{}'",
+            escape_conf_value(s)
+        )?;
+    }
+    if !spec.safekeeper_connstrings.is_empty() {
+        writeln!(
+            file,
+            "neon.safekeepers='{}'",
+            escape_conf_value(&spec.safekeeper_connstrings.join(","))
+        )?;
+    }
+    if let Some(s) = &spec.tenant_id {
+        writeln!(
+            file,
+            "neon.tenant_id='{}'",
+            escape_conf_value(&s.to_string())
+        )?;
+    }
+    if let Some(s) = &spec.timeline_id {
+        writeln!(
+            file,
+            "neon.timeline_id='{}'",
+            escape_conf_value(&s.to_string())
+        )?;
+    }
+
    match spec.mode {
        ComputeMode::Primary => {}
        ComputeMode::Static(lsn) => {
@@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        }
    }

-    writeln!(file, "# Managed by compute_ctl: end")?;
+    // If there are any extra options in the 'settings' field, append those
+    if spec.cluster.settings.is_some() {
+        writeln!(file, "# Managed by compute_ctl: begin")?;
+        write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
+        writeln!(file, "# Managed by compute_ctl: end")?;
+    }

    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {

 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
-async fn serve(state: Arc<ComputeNode>) {
-    let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
+async fn serve(port: u16, state: Arc<ComputeNode>) {
+    let addr = SocketAddr::from(([0, 0, 0, 0], port));

    let make_service = make_service_fn(move |_conn| {
        let state = state.clone();
@@ -256,10 +256,10 @@ async fn serve(state: Arc<ComputeNode>) {
 }

 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
-pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);

    Ok(thread::Builder::new()
        .name("http-endpoint".into())
-        .spawn(move || serve(state))?)
+        .spawn(move || serve(port, state))?)
 }
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
        .init();
    tracing::info!("logging and tracing started");

+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+
    Ok(())
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String {

 /// Escape a string so that it can be used in postgresql.conf.
 /// Same as escape_literal, currently.
-fn escape_conf_value(s: &str) -> String {
+pub fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

@@ -121,9 +121,8 @@ impl RoleExt for Role {
    /// string of arguments.
    fn to_pg_options(&self) -> String {
        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
-        // For now, we do not use generic `options` for roles. Once used, add
-        // `self.options.as_pg_options()` somewhere here.
-        let mut params: String = "LOGIN".to_string();
+        let mut params: String = self.options.as_pg_options();
+        params.push_str(" LOGIN");

        if let Some(pass) = &self.encrypted_password {
            // Some time ago we supported only md5 and treated all encrypted_password as md5.
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -62,7 +62,7 @@ fn do_control_plane_request(
    }
 }

-/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT`
+/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
 /// env variable is set, it will be used for authorization.
 pub fn get_spec_from_control_plane(
    base_uri: &str,
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -16,7 +16,7 @@ mod pg_helpers_tests {
        );
        assert_eq!(
            spec.cluster.roles.first().unwrap().to_pg_options(),
-            "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
+            " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'"
        );
    }

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: &str = "14";
+const DEFAULT_PG_VERSION: &str = "15";

 fn default_conf() -> String {
    format!(
@@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -

            println!("Creating endpoint for imported timeline ...");
            cplane.new_endpoint(
-                tenant_id,
                name,
+                tenant_id,
                timeline_id,
                None,
+                None,
                pg_version,
                ComputeMode::Primary,
            )?;
@@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(

                table.add_row([
                    endpoint_id.as_str(),
-                    &endpoint.address.to_string(),
+                    &endpoint.pg_address.to_string(),
                    &endpoint.timeline_id.to_string(),
                    branch_name,
                    lsn_str.as_str(),
@@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_branch_timeline_id(branch_name, tenant_id)
                .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;

-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
-
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let pg_version = sub_args
                .get_one::<u32>("pg-version")
                .copied()
@@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

-            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
+            cplane.new_endpoint(
+                &endpoint_id,
+                tenant_id,
+                timeline_id,
+                pg_port,
+                http_port,
+                pg_version,
+                mode,
+            )?;
        }
        "start" => {
-            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
+            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            // If --safekeepers argument is given, use only the listed safekeeper nodes.
+            let safekeepers =
+                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
+                    let mut safekeepers: Vec<NodeId> = Vec::new();
+                    for sk_id in safekeepers_str.split(',').map(str::trim) {
+                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
+                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
+                        })?);
+                        safekeepers.push(sk_id);
+                    }
+                    safekeepers
+                } else {
+                    env.safekeepers.iter().map(|sk| sk.id).collect()
+                };
+
            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token)?;
+                endpoint.start(&auth_token, safekeepers)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");

                let ep = cplane.new_endpoint(
-                    tenant_id,
                    endpoint_id,
+                    tenant_id,
                    timeline_id,
-                    port,
+                    pg_port,
+                    http_port,
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token)?;
+                ep.start(&auth_token, safekeepers)?;
            }
        }
        "stop" => {
@@ -944,11 +970,22 @@ fn cli() -> Command {
        .value_parser(value_parser!(u32))
        .default_value(DEFAULT_PG_VERSION);

-    let port_arg = Arg::new("port")
-        .long("port")
+    let pg_port_arg = Arg::new("pg-port")
+        .long("pg-port")
        .required(false)
        .value_parser(value_parser!(u16))
-        .value_name("port");
+        .value_name("pg-port");
+
+    let http_port_arg = Arg::new("http-port")
+        .long("http-port")
+        .required(false)
+        .value_parser(value_parser!(u16))
+        .value_name("http-port");
+
+    let safekeepers_arg = Arg::new("safekeepers")
+        .long("safekeepers")
+        .required(false)
+        .value_name("safekeepers");

    let stop_mode_arg = Arg::new("stop-mode")
        .short('m')
@@ -1093,7 +1130,8 @@ fn cli() -> Command {
                    .arg(branch_name_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(lsn_arg.clone())
-                    .arg(port_arg.clone())
+                    .arg(pg_port_arg.clone())
+                    .arg(http_port_arg.clone())
                    .arg(
                        Arg::new("config-only")
                            .help("Don't do basebackup, create endpoint directory with only config files")
@@ -1109,9 +1147,11 @@ fn cli() -> Command {
                    .arg(branch_name_arg)
                    .arg(timeline_id_arg)
                    .arg(lsn_arg)
-                    .arg(port_arg)
+                    .arg(pg_port_arg)
+                    .arg(http_port_arg)
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
+                    .arg(safekeepers_arg)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -1,3 +1,9 @@
+//! Code to manage the storage broker
+//!
+//! In the local test environment, the data for each safekeeper is stored in
+//!
+//!   .neon/safekeepers/<safekeeper id>
+//!
 use anyhow::Context;

 use std::path::PathBuf;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -1,40 +1,71 @@
+//! Code to manage compute endpoints
+//!
+//! In the local test environment, the data for each endpoint is stored in
+//!
+//!   .neon/endpoints/<endpoint id>
+//!
+//! Some basic information about the endpoint, like the tenant and timeline IDs,
+//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
+//! when the endpoint is created, and doesn't change afterwards.
+//!
+//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
+//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
+//! the basebackup from the pageserver to initialize the the data directory, and
+//! finally launches the PostgreSQL process. It watches the PostgreSQL process
+//! until it exits.
+//!
+//! When an endpoint is created, a `postgresql.conf` file is also created in
+//! the endpoint's directory. The file can be modified before starting PostgreSQL.
+//! However, the `postgresql.conf` file in the endpoint directory is not used directly
+//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
+//! copy of it in the data directory.
+//!
+//! Directory contents:
+//!
+//! ```ignore
+//! .neon/endpoints/main/
+//!     compute.log               - log output of `compute_ctl` and `postgres`
+//!     endpoint.json             - serialized `EndpointConf` struct
+//!     postgresql.conf           - postgresql settings
+//!     spec.json                 - passed to `compute_ctl`
+//!     pgdata/
+//!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
+//!         zenith.signal
+//!         <other PostgreSQL files>
+//! ```
+//!
 use std::collections::BTreeMap;
-use std::fs::{self, File};
-use std::io::Write;
 use std::net::SocketAddr;
 use std::net::TcpStream;
-use std::os::unix::fs::PermissionsExt;
 use std::path::PathBuf;
-use std::process::{Command, Stdio};
-use std::str::FromStr;
+use std::process::Command;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
 use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

-use compute_api::spec::ComputeMode;
+use compute_api::responses::{ComputeState, ComputeStatus};
+use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
 #[serde_as]
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct EndpointConf {
-    name: String,
+    endpoint_id: String,
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
    #[serde_as(as = "DisplayFromStr")]
    timeline_id: TimelineId,
    mode: ComputeMode,
-    port: u16,
+    pg_port: u16,
+    http_port: u16,
    pg_version: u32,
 }

@@ -57,11 +88,11 @@ impl ComputeControlPlane {
        let pageserver = Arc::new(PageServerNode::from_env(&env));

        let mut endpoints = BTreeMap::default();
-        for endpoint_dir in fs::read_dir(env.endpoints_path())
+        for endpoint_dir in std::fs::read_dir(env.endpoints_path())
            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
-            endpoints.insert(ep.name.clone(), Arc::new(ep));
+            endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
        }

        Ok(ComputeControlPlane {
@@ -76,25 +107,28 @@ impl ComputeControlPlane {
        1 + self
            .endpoints
            .values()
-            .map(|ep| ep.address.port())
+            .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
            .max()
            .unwrap_or(self.base_port)
    }

+    #[allow(clippy::too_many_arguments)]
    pub fn new_endpoint(
        &mut self,
+        endpoint_id: &str,
        tenant_id: TenantId,
-        name: &str,
        timeline_id: TimelineId,
-        port: Option<u16>,
+        pg_port: Option<u16>,
+        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
    ) -> Result<Arc<Endpoint>> {
-        let port = port.unwrap_or_else(|| self.get_port());
-
+        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
+        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let ep = Arc::new(Endpoint {
-            name: name.to_owned(),
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
+            endpoint_id: endpoint_id.to_owned(),
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
+            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
            pageserver: Arc::clone(&self.pageserver),
            timeline_id,
@@ -102,21 +136,27 @@ impl ComputeControlPlane {
            tenant_id,
            pg_version,
        });
-        ep.create_pgdata()?;
+
+        ep.create_endpoint_dir()?;
        std::fs::write(
            ep.endpoint_path().join("endpoint.json"),
            serde_json::to_string_pretty(&EndpointConf {
-                name: name.to_string(),
+                endpoint_id: endpoint_id.to_string(),
                tenant_id,
                timeline_id,
                mode,
-                port,
+                http_port,
+                pg_port,
                pg_version,
            })?,
        )?;
-        ep.setup_pg_conf()?;
+        std::fs::write(
+            ep.endpoint_path().join("postgresql.conf"),
+            ep.setup_pg_conf()?.to_string(),
+        )?;

-        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
+        self.endpoints
+            .insert(ep.endpoint_id.clone(), Arc::clone(&ep));

        Ok(ep)
    }
@@ -127,13 +167,15 @@ impl ComputeControlPlane {
 #[derive(Debug)]
 pub struct Endpoint {
    /// used as the directory name
-    name: String,
+    endpoint_id: String,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mode: ComputeMode,

-    // port and address of the Postgres server
-    pub address: SocketAddr,
+    // port and address of the Postgres server and `compute_ctl`'s HTTP API
+    pub pg_address: SocketAddr,
+    pub http_address: SocketAddr,
+
    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,

@@ -158,16 +200,16 @@ impl Endpoint {

        // parse data directory name
        let fname = entry.file_name();
-        let name = fname.to_str().unwrap().to_string();
+        let endpoint_id = fname.to_str().unwrap().to_string();

        // Read the endpoint.json file
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        // ok now
        Ok(Endpoint {
-            address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
-            name,
+            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
+            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
+            endpoint_id,
            env: env.clone(),
            pageserver: Arc::clone(pageserver),
            timeline_id: conf.timeline_id,
@@ -177,104 +219,17 @@ impl Endpoint {
        })
    }

-    fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
-        let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
-        let mut cmd = Command::new(pg_path);
-
-        cmd.arg("--sync-safekeepers")
-            .env_clear()
-            .env(
-                "LD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
-            )
-            .env(
-                "DYLD_LIBRARY_PATH",
-                self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
-            )
-            .env("PGDATA", self.pgdata().to_str().unwrap())
-            .stdout(Stdio::piped())
-            // Comment this to avoid capturing stderr (useful if command hangs)
-            .stderr(Stdio::piped());
-
-        if let Some(token) = auth_token {
-            cmd.env("NEON_AUTH_TOKEN", token);
-        }
-
-        let sync_handle = cmd
-            .spawn()
-            .expect("postgres --sync-safekeepers failed to start");
-
-        let sync_output = sync_handle
-            .wait_with_output()
-            .expect("postgres --sync-safekeepers failed");
-        if !sync_output.status.success() {
-            anyhow::bail!(
-                "sync-safekeepers failed: '{}'",
-                String::from_utf8_lossy(&sync_output.stderr)
-            );
-        }
-
-        let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
-        println!("Safekeepers synced on {}", lsn);
-        Ok(lsn)
-    }
-
-    /// Get basebackup from the pageserver as a tar archive and extract it
-    /// to the `self.pgdata()` directory.
-    fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
-        println!(
-            "Extracting base backup to create postgres instance: path={} port={}",
-            self.pgdata().display(),
-            self.address.port()
-        );
-
-        let sql = if let Some(lsn) = lsn {
-            format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
-        } else {
-            format!("basebackup {} {}", self.tenant_id, self.timeline_id)
-        };
-
-        let mut client = self
-            .pageserver
-            .page_server_psql_client()
-            .context("connecting to page server failed")?;
-
-        let copyreader = client
-            .copy_out(sql.as_str())
-            .context("page server 'basebackup' command failed")?;
-
-        // Read the archive directly from the `CopyOutReader`
-        //
-        // Set `ignore_zeros` so that unpack() reads all the Copy data and
-        // doesn't stop at the end-of-archive marker. Otherwise, if the server
-        // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
-        ar.set_ignore_zeros(true);
-        ar.unpack(&self.pgdata())
-            .context("extracting base backup failed")?;
-
-        Ok(())
-    }
-
-    fn create_pgdata(&self) -> Result<()> {
-        fs::create_dir_all(self.pgdata()).with_context(|| {
+    fn create_endpoint_dir(&self) -> Result<()> {
+        std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
            format!(
-                "could not create data directory {}",
-                self.pgdata().display()
+                "could not create endpoint directory {}",
+                self.endpoint_path().display()
            )
-        })?;
-        fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
-            .with_context(|| {
-                format!(
-                    "could not set permissions in data directory {}",
-                    self.pgdata().display()
-                )
-            })
+        })
    }

-    // Write postgresql.conf with default configuration
-    // and PG_VERSION file to the data directory of a new endpoint.
-    fn setup_pg_conf(&self) -> Result<()> {
+    // Generate postgresql.conf with default configuration
+    fn setup_pg_conf(&self) -> Result<PostgresConf> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
        conf.append("wal_log_hints", "off");
@@ -287,25 +242,14 @@ impl Endpoint {
        // wal_sender_timeout is the maximum time to wait for WAL replication.
        // It also defines how often the walreciever will send a feedback message to the wal sender.
        conf.append("wal_sender_timeout", "5s");
-        conf.append("listen_addresses", &self.address.ip().to_string());
-        conf.append("port", &self.address.port().to_string());
+        conf.append("listen_addresses", &self.pg_address.ip().to_string());
+        conf.append("port", &self.pg_address.port().to_string());
        conf.append("wal_keep_size", "0");
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");

-        // Configure the Neon Postgres extension to fetch pages from pageserver
-        let pageserver_connstr = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
-
-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        };
+        // Load the 'neon' extension
        conf.append("shared_preload_libraries", "neon");
-        conf.append_line("");
-        conf.append("neon.pageserver_connstring", &pageserver_connstr);
-        conf.append("neon.tenant_id", &self.tenant_id.to_string());
-        conf.append("neon.timeline_id", &self.timeline_id.to_string());

        conf.append_line("");
        // Replication-related configurations, such as WAL sending
@@ -390,46 +334,11 @@ impl Endpoint {
            }
        }

-        let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
-        file.write_all(conf.to_string().as_bytes())?;
-
-        let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
-        file.write_all(self.pg_version.to_string().as_bytes())?;
-
-        Ok(())
-    }
-
-    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
-        let backup_lsn = match &self.mode {
-            ComputeMode::Primary => {
-                if !self.env.safekeepers.is_empty() {
-                    // LSN 0 means that it is bootstrap and we need to download just
-                    // latest data from the pageserver. That is a bit clumsy but whole bootstrap
-                    // procedure evolves quite actively right now, so let's think about it again
-                    // when things would be more stable (TODO).
-                    let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
-                    if lsn == Lsn(0) {
-                        None
-                    } else {
-                        Some(lsn)
-                    }
-                } else {
-                    None
-                }
-            }
-            ComputeMode::Static(lsn) => Some(*lsn),
-            ComputeMode::Replica => {
-                None // Take the latest snapshot available to start with
-            }
-        };
-
-        self.do_basebackup(backup_lsn)?;
-
-        Ok(())
+        Ok(conf)
    }

    pub fn endpoint_path(&self) -> PathBuf {
-        self.env.endpoints_path().join(&self.name)
+        self.env.endpoints_path().join(&self.endpoint_id)
    }

    pub fn pgdata(&self) -> PathBuf {
@@ -439,7 +348,7 @@ impl Endpoint {
    pub fn status(&self) -> &str {
        let timeout = Duration::from_millis(300);
        let has_pidfile = self.pgdata().join("postmaster.pid").exists();
-        let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
+        let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();

        match (has_pidfile, can_connect) {
            (true, true) => "running",
@@ -457,8 +366,6 @@ impl Endpoint {
                &[
                    "-D",
                    self.pgdata().to_str().unwrap(),
-                    "-l",
-                    self.pgdata().join("pg.log").to_str().unwrap(),
                    "-w", //wait till pg_ctl actually does what was asked
                ],
                args,
@@ -494,36 +401,183 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }

-        // 1. We always start Postgres from scratch, so
-        // if old dir exists, preserve 'postgresql.conf' and drop the directory
-        let postgresql_conf_path = self.pgdata().join("postgresql.conf");
-        let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
-            format!(
-                "failed to read config file in {}",
-                postgresql_conf_path.to_str().unwrap()
-            )
-        })?;
-        fs::remove_dir_all(self.pgdata())?;
-        self.create_pgdata()?;
+        // Slurp the endpoints/<endpoint id>/postgresql.conf file into
+        // memory. We will include it in the spec file that we pass to
+        // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
+        // in the data directory.
+        let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
+        let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
+            Ok(content) => String::from_utf8(content)?,
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
+            Err(e) => {
+                return Err(anyhow::Error::new(e).context(format!(
+                    "failed to read config file in {}",
+                    postgresql_conf_path.to_str().unwrap()
+                )))
+            }
+        };

-        // 2. Bring back config files
-        fs::write(&postgresql_conf_path, postgresql_conf)?;
-
-        // 3. Load basebackup
-        self.load_basebackup(auth_token)?;
-
-        if self.mode != ComputeMode::Primary {
-            File::create(self.pgdata().join("standby.signal"))?;
+        // We always start the compute node from scratch, so if the Postgres
+        // data dir exists from a previous launch, remove it first.
+        if self.pgdata().exists() {
+            std::fs::remove_dir_all(self.pgdata())?;
        }

-        // 4. Finally start postgres
-        println!("Starting postgres at '{}'", self.connstr());
-        self.pg_ctl(&["start"], auth_token)
+        let pageserver_connstring = {
+            let config = &self.pageserver.pg_connection_config;
+            let (host, port) = (config.host(), config.port());
+
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{host}:{port}")
+        };
+        let mut safekeeper_connstrings = Vec::new();
+        if self.mode == ComputeMode::Primary {
+            for sk_id in safekeepers {
+                let sk = self
+                    .env
+                    .safekeepers
+                    .iter()
+                    .find(|node| node.id == sk_id)
+                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
+                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
+            }
+        }
+
+        // Create spec file
+        let spec = ComputeSpec {
+            format_version: 1.0,
+            operation_uuid: None,
+            cluster: Cluster {
+                cluster_id: None, // project ID: not used
+                name: None,       // project name: not used
+                state: None,
+                roles: vec![],
+                databases: vec![],
+                settings: None,
+                postgresql_conf: Some(postgresql_conf),
+            },
+            delta_operations: None,
+            tenant_id: Some(self.tenant_id),
+            timeline_id: Some(self.timeline_id),
+            mode: self.mode,
+            pageserver_connstring: Some(pageserver_connstring),
+            safekeeper_connstrings,
+            storage_auth_token: auth_token.clone(),
+        };
+        let spec_path = self.endpoint_path().join("spec.json");
+        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+
+        // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
+        let logfile = std::fs::OpenOptions::new()
+            .create(true)
+            .append(true)
+            .open(self.endpoint_path().join("compute.log"))?;
+
+        // Launch compute_ctl
+        println!("Starting postgres node at '{}'", self.connstr());
+        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+        cmd.args(["--http-port", &self.http_address.port().to_string()])
+            .args(["--pgdata", self.pgdata().to_str().unwrap()])
+            .args(["--connstr", &self.connstr()])
+            .args([
+                "--spec-path",
+                self.endpoint_path().join("spec.json").to_str().unwrap(),
+            ])
+            .args([
+                "--pgbin",
+                self.env
+                    .pg_bin_dir(self.pg_version)?
+                    .join("postgres")
+                    .to_str()
+                    .unwrap(),
+            ])
+            .stdin(std::process::Stdio::null())
+            .stderr(logfile.try_clone()?)
+            .stdout(logfile);
+        let _child = cmd.spawn()?;
+
+        // Wait for it to start
+        let mut attempt = 0;
+        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
+        const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
+        loop {
+            attempt += 1;
+            match self.get_status() {
+                Ok(state) => {
+                    match state.status {
+                        ComputeStatus::Init => {
+                            if attempt == MAX_ATTEMPTS {
+                                bail!("compute startup timed out; still in Init state");
+                            }
+                            // keep retrying
+                        }
+                        ComputeStatus::Running => {
+                            // All good!
+                            break;
+                        }
+                        ComputeStatus::Failed => {
+                            bail!(
+                                "compute startup failed: {}",
+                                state
+                                    .error
+                                    .as_deref()
+                                    .unwrap_or("<no error from compute_ctl>")
+                            );
+                        }
+                        ComputeStatus::Empty
+                        | ComputeStatus::ConfigurationPending
+                        | ComputeStatus::Configuration => {
+                            bail!("unexpected compute status: {:?}", state.status)
+                        }
+                    }
+                }
+                Err(e) => {
+                    if attempt == MAX_ATTEMPTS {
+                        return Err(e).context(
+                            "timed out waiting to connect to compute_ctl HTTP; last error: {e}",
+                        );
+                    }
+                }
+            }
+            std::thread::sleep(ATTEMPT_INTERVAL);
+        }
+
+        Ok(())
+    }
+
+    // Call the /status HTTP API
+    pub fn get_status(&self) -> Result<ComputeState> {
+        let client = reqwest::blocking::Client::new();
+
+        let response = client
+            .request(
+                reqwest::Method::GET,
+                format!(
+                    "http://{}:{}/status",
+                    self.http_address.ip(),
+                    self.http_address.port()
+                ),
+            )
+            .send()?;
+
+        // Interpret the response
+        let status = response.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            Ok(response.json()?)
+        } else {
+            // reqwest does not export its error construction utility functions, so let's craft the message ourselves
+            let url = response.url().to_owned();
+            let msg = match response.text() {
+                Ok(err_body) => format!("Error: {}", err_body),
+                Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
+            };
+            Err(anyhow::anyhow!(msg))
+        }
    }

    pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -540,7 +594,7 @@ impl Endpoint {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            fs::remove_dir_all(self.endpoint_path())?;
+            std::fs::remove_dir_all(self.endpoint_path())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
@@ -549,10 +603,10 @@ impl Endpoint {

    pub fn connstr(&self) -> String {
        format!(
-            "host={} port={} user={} dbname={}",
-            self.address.ip(),
-            self.address.port(),
+            "postgresql://{}@{}:{}/{}",
            "cloud_admin",
+            self.pg_address.ip(),
+            self.pg_address.port(),
            "postgres"
        )
    }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -24,7 +24,7 @@ use utils::{

 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 //
 // This data structures represents neon_local CLI config
@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 pub struct LocalEnv {
    // Base directory for all the nodes (the pageserver, safekeepers and
-    // compute nodes).
+    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
    // config file itself is. It is read from the NEON_REPO_DIR env variable or
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -1,3 +1,9 @@
+//! Code to manage pageservers
+//!
+//! In the local test environment, the pageserver stores its data directly in
+//!
+//!   .neon/
+//!
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::fs::File;
@@ -369,7 +375,16 @@ impl PageServerNode {
            evictions_low_residence_duration_metric_threshold: settings
                .remove("evictions_low_residence_duration_metric_threshold")
                .map(|x| x.to_string()),
+            gc_feedback: settings
+                .remove("gc_feedback")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_feedback' as bool")?,
        };
+
+        // If tenant ID was not specified, generate one
+        let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate());
+
        let request = models::TenantCreateRequest {
            new_tenant_id,
            config,
@@ -459,6 +474,11 @@ impl PageServerNode {
                evictions_low_residence_duration_metric_threshold: settings
                    .remove("evictions_low_residence_duration_metric_threshold")
                    .map(|x| x.to_string()),
+                gc_feedback: settings
+                    .remove("gc_feedback")
+                    .map(|x| x.parse::<bool>())
+                    .transpose()
+                    .context("Failed to parse 'gc_feedback' as bool")?,
            }
        };

@@ -495,6 +515,9 @@ impl PageServerNode {
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
    ) -> anyhow::Result<TimelineInfo> {
+        // If timeline ID was not specified, generate one
+        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
+
        self.http_request(
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,3 +1,9 @@
+//! Code to manage safekeepers
+//!
+//! In the local test environment, the data for each safekeeper is stored in
+//!
+//!   .neon/safekeepers/<safekeeper id>
+//!
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 set -eux

+# Generate a random tenant or timeline ID
+#
+# Takes a variable name as argument. The result is stored in that variable.
+generate_id() {
+    local -n resvar=$1
+    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+}
+
 PG_VERSION=${PG_VERSION:-14}

 SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
@@ -13,29 +21,29 @@ done
 echo "Page server is ready."

 echo "Create a tenant and timeline"
+generate_id tenant_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{}"
+     -d "{\"new_tenant_id\": \"${tenant_id}\"}"
     http://pageserver:9898/v1/tenant/
 )
-tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g')
+result=$(curl "${PARAMS[@]}")
+echo $result | jq .

+generate_id timeline_id
 PARAMS=(
     -sb 
     -X POST
     -H "Content-Type: application/json"
-     -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}"
+     -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
     "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
 )
 result=$(curl "${PARAMS[@]}")
 echo $result | jq .

 echo "Overwrite tenant id and timeline id in spec file"
-tenant_id=$(echo ${result} | jq -r .tenant_id)
-timeline_id=$(echo ${result} | jq -r .timeline_id)
-
 sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE}
 sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations
 by spawning a separate task. The code that handles incoming HTTP
 requests, for example, spawns a separate task for each request,
 because Hyper will drop the request-handling Future if the HTTP
-connection is lost.  (FIXME: our HTTP handlers do not do that
-currently, but we should fix that. See [issue
-3478](https://github.com/neondatabase/neon/issues/3478)).
+connection is lost.


 #### How to cancel, then?
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};

 use crate::spec::ComputeSpec;

-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 pub struct GenericAPIError {
    pub error: String,
 }

 /// Response of the /status API
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeStatusResponse {
    pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
    pub error: Option<String>,
 }

-#[derive(Serialize)]
+#[derive(Deserialize, Serialize)]
 #[serde(rename_all = "snake_case")]
 pub struct ComputeState {
    pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
    pub error: Option<String>,
 }

-#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
    // Spec wasn't provided at start, waiting for it to be
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,6 +5,7 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 /// String type alias representing Postgres identifier and
@@ -14,7 +15,7 @@ pub type PgIdent = String;
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[serde_as]
-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct ComputeSpec {
    pub format_version: f32,

@@ -26,9 +27,32 @@ pub struct ComputeSpec {
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,

+    // Information needed to connect to the storage layer.
+    //
+    // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
+    //
+    // Depending on `mode`, this can be a primary read-write node, a read-only
+    // replica, or a read-only node pinned at an older LSN.
+    // `safekeeper_connstrings` must be set for a primary.
+    //
+    // For backwards compatibility, the control plane may leave out all of
+    // these, and instead set the "neon.tenant_id", "neon.timeline_id",
+    // etc. GUCs in cluster.settings. TODO: Once the control plane has been
+    // updated to fill these fields, we can make these non optional.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub tenant_id: Option<TenantId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub timeline_id: Option<TimelineId>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    pub pageserver_connstring: Option<String>,
+    #[serde(default)]
+    pub safekeeper_connstrings: Vec<String>,
+
    #[serde(default)]
    pub mode: ComputeMode,

+    /// If set, 'storage_auth_token' is used as the password to authenticate to
+    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
 }

@@ -47,13 +71,19 @@ pub enum ComputeMode {
    Replica,
 }

-#[derive(Clone, Debug, Default, Deserialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct Cluster {
-    pub cluster_id: String,
-    pub name: String,
+    pub cluster_id: Option<String>,
+    pub name: Option<String>,
    pub state: Option<String>,
    pub roles: Vec<Role>,
    pub databases: Vec<Database>,
+
+    /// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
+    /// tool may add additional settings to the final file.)
+    pub postgresql_conf: Option<String>,
+
+    /// Additional settings that will be appended to the 'postgresql.conf' file.
    pub settings: GenericOptions,
 }

@@ -63,7 +93,7 @@ pub struct Cluster {
 /// - DROP ROLE
 /// - ALTER ROLE name RENAME TO new_name
 /// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct DeltaOp {
    pub action: String,
    pub name: PgIdent,
@@ -72,7 +102,7 @@ pub struct DeltaOp {

 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
@@ -81,7 +111,7 @@ pub struct Role {

 /// Rust representation of Postgres database info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
@@ -91,7 +121,7 @@ pub struct Database {
 /// Common type representing both SQL statement params with or without value,
 /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
 /// options like `wal_level = logical`.
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,29 @@ use crate::reltag::RelTag;
 use anyhow::bail;
 use bytes::{BufMut, Bytes, BytesMut};

-/// A state of a tenant in pageserver's memory.
+/// The state of a tenant in this pageserver.
+///
+/// ```mermaid
+/// stateDiagram-v2
+///
+///     [*] --> Loading: spawn_load()
+///     [*] --> Attaching: spawn_attach()
+///
+///     Loading --> Activating: activate()
+///     Attaching --> Activating: activate()
+///     Activating --> Active: infallible
+///
+///     Loading --> Broken: load() failure
+///     Attaching --> Broken: attach() failure
+///
+///     Active --> Stopping: set_stopping(), part of shutdown & detach
+///     Stopping --> Broken: late error in remove_tenant_from_memory
+///
+///     Broken --> [*]: ignore / detach / shutdown
+///     Stopping --> [*]: remove_from_memory complete
+///
+///     Active --> Broken: cfg(testing)-only tenant break point
+/// ```
 #[derive(
    Clone,
    PartialEq,
@@ -26,43 +48,63 @@ use bytes::{BufMut, Bytes, BytesMut};
    serde::Serialize,
    serde::Deserialize,
    strum_macros::Display,
-    strum_macros::EnumString,
    strum_macros::EnumVariantNames,
    strum_macros::AsRefStr,
    strum_macros::IntoStaticStr,
 )]
 #[serde(tag = "slug", content = "data")]
 pub enum TenantState {
-    /// This tenant is being loaded from local disk
+    /// This tenant is being loaded from local disk.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Loading,
-    /// This tenant is being downloaded from cloud storage.
+    /// This tenant is being attached to the pageserver.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
    Attaching,
    /// The tenant is transitioning from Loading/Attaching to Active.
-    Activating,
-    /// Tenant is fully operational
+    ///
+    /// While in this state, the individual timelines are being activated.
+    ///
+    /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass.
+    Activating(ActivatingFrom),
+    /// The tenant has finished activating and is open for business.
+    ///
+    /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`.
    Active,
-    /// A tenant is recognized by pageserver, but it is being detached or the
+    /// The tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
+    ///
+    /// Transitions out of this state are possible through `set_broken()`.
    Stopping,
-    /// A tenant is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The tenant is recognized by the pageserver, but can no longer be used for
+    /// any operations.
+    ///
+    /// If the tenant fails to load or attach, it will transition to this state
+    /// and it is guaranteed that no background tasks are running in its name.
+    ///
+    /// The other way to transition into this state is from `Stopping` state
+    /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens
+    /// if the cleanup future executed by `remove_tenant_from_memory()` fails.
    Broken { reason: String, backtrace: String },
 }

 impl TenantState {
    pub fn attachment_status(&self) -> TenantAttachmentStatus {
        use TenantAttachmentStatus::*;
+
+        // Below TenantState::Activating is used as "transient" or "transparent" state for
+        // attachment_status determining.
        match self {
            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
            // So, technically, we can return Attached here.
            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
            // But, our attach task might still be fetching the remote timelines, etc.
            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
-            Self::Attaching => Maybe,
+            Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
            // tenant mgr startup distinguishes attaching from loading via marker file.
            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
-            Self::Loading => Attached,
-            Self::Activating => todo!(),
+            Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
            // We only reach Active after successful load / attach.
            // So, call atttachment status Attached.
            Self::Active => Attached,
@@ -101,6 +143,15 @@ impl std::fmt::Debug for TenantState {
    }
 }

+/// The only [`TenantState`] variants we could be `TenantState::Activating` from.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub enum ActivatingFrom {
+    /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`]
+    Loading,
+    /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`]
+    Attaching,
+}
+
 /// A state of a timeline in pageserver's memory.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
@@ -121,9 +172,8 @@ pub enum TimelineState {
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 pub struct TimelineCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_timeline_id: Option<TimelineId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_timeline_id: TimelineId,
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub ancestor_timeline_id: Option<TimelineId>,
@@ -134,12 +184,11 @@ pub struct TimelineCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize, Debug, Default)]
+#[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
-    #[serde(default)]
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    pub new_tenant_id: Option<TenantId>,
+    #[serde_as(as = "DisplayFromStr")]
+    pub new_tenant_id: TenantId,
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -174,6 +223,7 @@ pub struct TenantConfig {
    pub eviction_policy: Option<serde_json::Value>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
+    pub gc_feedback: Option<bool>,
 }

 #[serde_as]
@@ -187,10 +237,10 @@ pub struct StatusResponse {
 }

 impl TenantCreateRequest {
-    pub fn new(new_tenant_id: Option<TenantId>) -> TenantCreateRequest {
+    pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest {
        TenantCreateRequest {
            new_tenant_id,
-            ..Default::default()
+            config: TenantConfig::default(),
        }
    }
 }
@@ -232,6 +282,7 @@ impl TenantConfigRequest {
            eviction_policy: None,
            min_resident_size_override: None,
            evictions_low_residence_duration_metric_threshold: None,
+            gc_feedback: None,
        };
        TenantConfigRequest { tenant_id, config }
    }
@@ -834,4 +885,55 @@ mod tests {
            err
        );
    }
+
+    #[test]
+    fn tenantstatus_activating_serde() {
+        let states = [
+            TenantState::Activating(ActivatingFrom::Loading),
+            TenantState::Activating(ActivatingFrom::Attaching),
+        ];
+        let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]";
+
+        let actual = serde_json::to_string(&states).unwrap();
+
+        assert_eq!(actual, expected);
+
+        let parsed = serde_json::from_str::<Vec<TenantState>>(&actual).unwrap();
+
+        assert_eq!(states.as_slice(), &parsed);
+    }
+
+    #[test]
+    fn tenantstatus_activating_strum() {
+        // tests added, because we use these for metrics
+        let examples = [
+            (line!(), TenantState::Loading, "Loading"),
+            (line!(), TenantState::Attaching, "Attaching"),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Loading),
+                "Activating",
+            ),
+            (
+                line!(),
+                TenantState::Activating(ActivatingFrom::Attaching),
+                "Activating",
+            ),
+            (line!(), TenantState::Active, "Active"),
+            (line!(), TenantState::Stopping, "Stopping"),
+            (
+                line!(),
+                TenantState::Broken {
+                    reason: "Example".into(),
+                    backtrace: "Looooong backtrace".into(),
+                },
+                "Broken",
+            ),
+        ];
+
+        for (line, rendered, expected) in examples {
+            let actual: &'static str = rendered.into();
+            assert_eq!(actual, expected, "example on {line}");
+        }
+    }
 }
--- a/libs/utils/src/completion.rs
+++ b/libs/utils/src/completion.rs
@@ -0,0 +1,33 @@
+use std::sync::Arc;
+
+use tokio::sync::{mpsc, Mutex};
+
+/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
+///
+/// Can be cloned, moved and kept around in futures as "guard objects".
+#[derive(Clone)]
+pub struct Completion(mpsc::Sender<()>);
+
+/// Barrier will wait until all clones of [`Completion`] have been dropped.
+#[derive(Clone)]
+pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+
+impl Barrier {
+    pub async fn wait(self) {
+        self.0.lock().await.recv().await;
+    }
+
+    pub async fn maybe_wait(barrier: Option<Barrier>) {
+        if let Some(b) = barrier {
+            b.wait().await
+        }
+    }
+}
+
+/// Create new Guard and Barrier pair.
+pub fn channel() -> (Completion, Barrier) {
+    let (tx, rx) = mpsc::channel::<()>(1);
+    let rx = Mutex::new(rx);
+    let rx = Arc::new(rx);
+    (Completion(tx), Barrier(rx))
+}
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,5 +1,5 @@
 use crate::auth::{Claims, JwtAuth};
-use crate::http::error;
+use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
@@ -16,8 +16,6 @@ use std::future::Future;
 use std::net::TcpListener;
 use std::str::FromStr;

-use super::error::ApiError;
-
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
@@ -35,8 +33,18 @@ struct RequestId(String);
 /// Adds a tracing info_span! instrumentation around the handler events,
 /// logs the request start and end events for non-GET requests and non-200 responses.
 ///
+/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)`
+///
 /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
-/// in this type will get request info logged in the wrapping span, including the unique request ID.
+/// with this will get request info logged in the wrapping span, including the unique request ID.
+///
+/// This also handles errors, logging them and converting them to an HTTP error response.
+///
+/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
+/// completion. In other words, the handler must be async cancellation safe! request_span
+/// prints a warning to the log when that happens, so that you have some trace of it in
+/// the log.
+///
 ///
 /// There could be other ways to implement similar functionality:
 ///
@@ -54,60 +62,56 @@ struct RequestId(String);
 /// tries to achive with its `.instrument` used in the current approach.
 ///
 /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
-pub struct RequestSpan<E, R, H>(pub H)
+pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output
 where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
-
-impl<E, R, H> RequestSpan<E, R, H>
-where
-    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
-    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
-    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
-    /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
-    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
-    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
-        let request_id = request.context::<RequestId>().unwrap_or_default().0;
-        let method = request.method();
-        let path = request.uri().path();
-        let request_span = info_span!("request", %method, %path, %request_id);
+    let request_id = request.context::<RequestId>().unwrap_or_default().0;
+    let method = request.method();
+    let path = request.uri().path();
+    let request_span = info_span!("request", %method, %path, %request_id);

-        let log_quietly = method == Method::GET;
-        async move {
-            let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
-            if log_quietly {
-                debug!("Handling request");
-            } else {
-                info!("Handling request");
-            }
-
-            // Note that we reuse `error::handler` here and not returning and error at all,
-            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
-            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
-            //
-            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
-            let res = (self.0)(request).await;
-
-            cancellation_guard.disarm();
-
-            match res {
-                Ok(response) => {
-                    let response_status = response.status();
-                    if log_quietly && response_status.is_success() {
-                        debug!("Request handled, status: {response_status}");
-                    } else {
-                        info!("Request handled, status: {response_status}");
-                    }
-                    Ok(response)
-                }
-                Err(e) => Ok(error::handler(e.into()).await),
-            }
+    let log_quietly = method == Method::GET;
+    async move {
+        let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding();
+        if log_quietly {
+            debug!("Handling request");
+        } else {
+            info!("Handling request");
+        }
+
+        // No special handling for panics here. There's a `tracing_panic_hook` from another
+        // module to do that globally.
+        let res = handler(request).await;
+
+        cancellation_guard.disarm();
+
+        // Log the result if needed.
+        //
+        // We also convert any errors into an Ok response with HTTP error code here.
+        // `make_router` sets a last-resort error handler that would do the same, but
+        // we prefer to do it here, before we exit the request span, so that the error
+        // is still logged with the span.
+        //
+        // (Because we convert errors to Ok response, we never actually return an error,
+        // and we could declare the function to return the never type (`!`). However,
+        // using `routerify::RouterBuilder` requires a proper error type.)
+        match res {
+            Ok(response) => {
+                let response_status = response.status();
+                if log_quietly && response_status.is_success() {
+                    debug!("Request handled, status: {response_status}");
+                } else {
+                    info!("Request handled, status: {response_status}");
+                }
+                Ok(response)
+            }
+            Err(err) => Ok(api_error_handler(err)),
        }
-        .instrument(request_span)
-        .await
    }
+    .instrument(request_span)
+    .await
 }

 /// Drop guard to WARN in case the request was dropped before completion.
@@ -207,10 +211,8 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
-        .get("/metrics", |r| {
-            RequestSpan(prometheus_metrics_handler).handle(r)
-        })
-        .err_handler(error::handler)
+        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .err_handler(route_error_handler)
 }

 pub fn attach_openapi_ui(
@@ -220,12 +222,14 @@ pub fn attach_openapi_ui(
    ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    router_builder
-        .get(spec_mount_path, move |r| {
-            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
-                .handle(r)
-        })
-        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
-            Ok(Response::builder().body(Body::from(format!(r#"
+        .get(spec_mount_path,
+            move |r| request_span(r, move |_| async move {
+                Ok(Response::builder().body(Body::from(spec)).unwrap())
+            })
+        )
+        .get(ui_mount_path,
+             move |r| request_span(r, move |_| async move {
+                 Ok(Response::builder().body(Body::from(format!(r#"
                <!DOCTYPE html>
                <html lang="en">
                <head>
@@ -255,7 +259,8 @@ pub fn attach_openapi_ui(
                </body>
                </html>
            "#, spec_mount_path))).unwrap())
-        }).handle(r))
+             })
+        )
 }

 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -83,13 +83,24 @@ impl HttpErrorBody {
    }
 }

-pub async fn handler(err: routerify::RouteError) -> Response<Body> {
-    let api_error = err
-        .downcast::<ApiError>()
-        .expect("handler should always return api error");
+pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
+    match err.downcast::<ApiError>() {
+        Ok(api_error) => api_error_handler(*api_error),
+        Err(other_error) => {
+            // We expect all the request handlers to return an ApiError, so this should
+            // not be reached. But just in case.
+            error!("Error processing HTTP request: {other_error:?}");
+            HttpErrorBody::response_from_msg_and_status(
+                other_error.to_string(),
+                StatusCode::INTERNAL_SERVER_ERROR,
+            )
+        }
+    }
+}

+pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error.as_ref() {
+    if let ApiError::InternalServerError(_) = api_error {
        error!("Error processing HTTP request: {api_error:?}");
    } else {
        error!("Error processing HTTP request: {api_error:#}");
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,6 +60,9 @@ pub mod tracing_span_assert;

 pub mod rate_limit;

+/// Simple once-barrier and a guard which keeps barrier awaiting.
+pub mod completion;
+
 mod failpoint_macro_helpers {

    /// use with fail::cfg("$name", "return(2000)")
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "pagectl"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+clap = { workspace = true, features = ["string"] }
+git-version.workspace = true
+pageserver = { path = ".." }
+postgres_ffi.workspace = true
+utils.workspace = true
+svg_fmt.workspace = true
+workspace_hack.workspace = true
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -12,7 +12,7 @@
 //! Example use:
 //! ```
 //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
+//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
 //! $ firefox out.svg
 //! ```
 //!
@@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
    (keys, lsns)
 }

-fn main() -> Result<()> {
+pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
    let stdin = io::stdin();
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -6,7 +6,7 @@ use anyhow::Result;
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
-use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr};
+use std::{fs, path::Path, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
@@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile;

 use utils::{bin_ser::BeSer, lsn::Lsn};

+use crate::AnalyzeLayerMapCmd;
+
 const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128;
 const DEFAULT_MAX_HOLES: usize = 10;

 /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
 #[derive(PartialEq, Eq)]
-struct Hole(Range<Key>);
+pub struct Hole(Range<Key>);

 impl Ord for Hole {
    fn cmp(&self, other: &Self) -> Ordering {
@@ -39,11 +41,11 @@ impl PartialOrd for Hole {
    }
 }

-struct LayerFile {
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-    is_delta: bool,
-    holes: Vec<Hole>,
+pub(crate) struct LayerFile {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+    pub is_delta: bool,
+    pub holes: Vec<Hole>,
 }

 impl LayerFile {
@@ -67,7 +69,7 @@ impl LayerFile {
    }
 }

-fn parse_filename(name: &str) -> Option<LayerFile> {
+pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
    let split: Vec<&str> = name.split("__").collect();
    if split.len() != 2 {
        return None;
@@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    Ok(holes)
 }

-fn main() -> Result<()> {
-    let args: Vec<String> = env::args().collect();
-    if args.len() < 2 {
-        println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]");
-        return Ok(());
-    }
-    let storage_path = PathBuf::from_str(&args[1])?;
-    let max_holes = if args.len() > 2 {
-        args[2].parse::<usize>().unwrap()
-    } else {
-        DEFAULT_MAX_HOLES
-    };
+pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
+    let storage_path = &cmd.path;
+    let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);

    // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
    pageserver::virtual_file::init(10);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -0,0 +1,169 @@
+use std::path::{Path, PathBuf};
+
+use anyhow::Result;
+use clap::Subcommand;
+use pageserver::tenant::block_io::BlockCursor;
+use pageserver::tenant::disk_btree::DiskBtreeReader;
+use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::{page_cache, virtual_file};
+use pageserver::{
+    repository::{Key, KEY_SIZE},
+    tenant::{
+        block_io::FileBlockReader, disk_btree::VisitDirection,
+        storage_layer::delta_layer::DELTA_KEY_SIZE,
+    },
+    virtual_file::VirtualFile,
+};
+use std::fs;
+use utils::bin_ser::BeSer;
+
+use crate::layer_map_analyzer::parse_filename;
+
+#[derive(Subcommand)]
+pub(crate) enum LayerCmd {
+    /// List all tenants and timelines under the pageserver path
+    ///
+    /// Example: `cargo run --bin pagectl layer list .neon/`
+    List { path: PathBuf },
+    /// List all layers of a given tenant and timeline
+    ///
+    /// Example: `cargo run --bin pagectl layer list .neon/`
+    ListLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+    },
+    /// Dump all information of a layer file
+    DumpLayer {
+        path: PathBuf,
+        tenant: String,
+        timeline: String,
+        /// The id from list-layer command
+        id: usize,
+    },
+}
+
+fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
+    use pageserver::tenant::blob_io::BlobCursor;
+    use pageserver::tenant::block_io::BlockReader;
+
+    let path = path.as_ref();
+    virtual_file::init(10);
+    page_cache::init(100);
+    let file = FileBlockReader::new(VirtualFile::open(path)?);
+    let summary_blk = file.read_blk(0)?;
+    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+        actual_summary.index_start_blk,
+        actual_summary.index_root_blk,
+        &file,
+    );
+    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
+    let mut all = vec![];
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, value_offset| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            all.push((curr, BlobRef(value_offset)));
+            true
+        },
+    )?;
+    let mut cursor = BlockCursor::new(&file);
+    for (k, v) in all {
+        let value = cursor.read_blob(v.pos())?;
+        println!("key:{} value_len:{}", k, value.len());
+    }
+    // TODO(chi): special handling for last key?
+    Ok(())
+}
+
+pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
+    match cmd {
+        LayerCmd::List { path } => {
+            for tenant in fs::read_dir(path.join("tenants"))? {
+                let tenant = tenant?;
+                if !tenant.file_type()?.is_dir() {
+                    continue;
+                }
+                println!("tenant {}", tenant.file_name().to_string_lossy());
+                for timeline in fs::read_dir(tenant.path().join("timelines"))? {
+                    let timeline = timeline?;
+                    if !timeline.file_type()?.is_dir() {
+                        continue;
+                    }
+                    println!("- timeline {}", timeline.file_name().to_string_lossy());
+                }
+            }
+        }
+        LayerCmd::ListLayer {
+            path,
+            tenant,
+            timeline,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    println!(
+                        "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                        idx,
+                        layer_file.key_range.start,
+                        layer_file.key_range.end,
+                        layer_file.lsn_range.start,
+                        layer_file.lsn_range.end,
+                        layer_file.is_delta,
+                    );
+                    idx += 1;
+                }
+            }
+        }
+        LayerCmd::DumpLayer {
+            path,
+            tenant,
+            timeline,
+            id,
+        } => {
+            let timeline_path = path
+                .join("tenants")
+                .join(tenant)
+                .join("timelines")
+                .join(timeline);
+            let mut idx = 0;
+            for layer in fs::read_dir(timeline_path)? {
+                let layer = layer?;
+                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
+                {
+                    if *id == idx {
+                        // TODO(chi): dedup code
+                        println!(
+                            "[{:3}]  key:{}-{}\n       lsn:{}-{}\n       delta:{}",
+                            idx,
+                            layer_file.key_range.start,
+                            layer_file.key_range.end,
+                            layer_file.lsn_range.start,
+                            layer_file.lsn_range.end,
+                            layer_file.is_delta,
+                        );
+
+                        if layer_file.is_delta {
+                            read_delta_file(layer.path())?;
+                        } else {
+                            anyhow::bail!("not supported yet :(");
+                        }
+
+                        break;
+                    }
+                    idx += 1;
+                }
+            }
+        }
+    }
+    Ok(())
+}
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -0,0 +1,179 @@
+//! A helper tool to manage pageserver binary files.
+//! Accepts a file as an argument, attempts to parse it with all ways possible
+//! and prints its interpreted context.
+//!
+//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
+
+mod draw_timeline_dir;
+mod layer_map_analyzer;
+mod layers;
+
+use clap::{Parser, Subcommand};
+use layers::LayerCmd;
+use pageserver::{
+    context::{DownloadBehavior, RequestContext},
+    page_cache,
+    task_mgr::TaskKind,
+    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
+    virtual_file,
+};
+use postgres_ffi::ControlFileData;
+use std::path::{Path, PathBuf};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);
+
+#[derive(Parser)]
+#[command(
+    version = GIT_VERSION,
+    about = "Neon Pageserver binutils",
+    long_about = "Reads pageserver (and related) binary files management utility"
+)]
+#[command(propagate_version = true)]
+struct CliOpts {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    Metadata(MetadataCmd),
+    PrintLayerFile(PrintLayerFileCmd),
+    DrawTimeline {},
+    AnalyzeLayerMap(AnalyzeLayerMapCmd),
+    #[command(subcommand)]
+    Layer(LayerCmd),
+}
+
+/// Read and update pageserver metadata file
+#[derive(Parser)]
+struct MetadataCmd {
+    /// Input metadata file path
+    metadata_path: PathBuf,
+    /// Replace disk consistent Lsn
+    disk_consistent_lsn: Option<Lsn>,
+    /// Replace previous record Lsn
+    prev_record_lsn: Option<Lsn>,
+    /// Replace latest gc cuttoff
+    latest_gc_cuttoff: Option<Lsn>,
+}
+
+#[derive(Parser)]
+struct PrintLayerFileCmd {
+    /// Pageserver data path
+    path: PathBuf,
+}
+
+#[derive(Parser)]
+struct AnalyzeLayerMapCmd {
+    /// Pageserver data path
+    path: PathBuf,
+    /// Max holes
+    max_holes: Option<usize>,
+}
+
+fn main() -> anyhow::Result<()> {
+    let cli = CliOpts::parse();
+
+    match cli.command {
+        Commands::Layer(cmd) => {
+            layers::main(&cmd)?;
+        }
+        Commands::Metadata(cmd) => {
+            handle_metadata(&cmd)?;
+        }
+        Commands::DrawTimeline {} => {
+            draw_timeline_dir::main()?;
+        }
+        Commands::AnalyzeLayerMap(cmd) => {
+            layer_map_analyzer::main(&cmd)?;
+        }
+        Commands::PrintLayerFile(cmd) => {
+            if let Err(e) = read_pg_control_file(&cmd.path) {
+                println!(
+                    "Failed to read input file as a pg control one: {e:#}\n\
+                    Attempting to read it as layer file"
+                );
+                print_layerfile(&cmd.path)?;
+            }
+        }
+    };
+    Ok(())
+}
+
+fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
+    println!("{control_file:?}");
+    let control_file_initdb = Lsn(control_file.checkPoint);
+    println!(
+        "pg_initdb_lsn: {}, aligned: {}",
+        control_file_initdb,
+        control_file_initdb.align()
+    );
+    Ok(())
+}
+
+fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+    // Basic initialization of things that don't change after startup
+    virtual_file::init(10);
+    page_cache::init(100);
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    dump_layerfile_from_path(path, true, &ctx)
+}
+
+fn handle_metadata(
+    MetadataCmd {
+        metadata_path: path,
+        disk_consistent_lsn,
+        prev_record_lsn,
+        latest_gc_cuttoff,
+    }: &MetadataCmd,
+) -> Result<(), anyhow::Error> {
+    let metadata_bytes = std::fs::read(path)?;
+    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
+    println!("Current metadata:\n{meta:?}");
+    let mut update_meta = false;
+    if let Some(disk_consistent_lsn) = disk_consistent_lsn {
+        meta = TimelineMetadata::new(
+            *disk_consistent_lsn,
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(prev_record_lsn) = prev_record_lsn {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            Some(*prev_record_lsn),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            meta.latest_gc_cutoff_lsn(),
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+    if let Some(latest_gc_cuttoff) = latest_gc_cuttoff {
+        meta = TimelineMetadata::new(
+            meta.disk_consistent_lsn(),
+            meta.prev_record_lsn(),
+            meta.ancestor_timeline(),
+            meta.ancestor_lsn(),
+            *latest_gc_cuttoff,
+            meta.initdb_lsn(),
+            meta.pg_version(),
+        );
+        update_meta = true;
+    }
+
+    if update_meta {
+        let metadata_bytes = meta.to_bytes()?;
+        std::fs::write(path, metadata_bytes)?;
+    }
+
+    Ok(())
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -275,6 +275,7 @@ fn start_pageserver(
    let pageserver_listener = tcp_listener::bind(pg_addr)?;

    // Launch broker client
+    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
        .block_on(async {
            // Note: we do not attempt connecting here (but validate endpoints sanity).
@@ -334,13 +335,119 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Startup staging or optimizing:
+    //
+    // We want to minimize downtime for `page_service` connections, and trying not to overload
+    // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
+    //
+    // init_done_rx will notify when all initial load operations have completed.
+    //
+    // background_jobs_can_start (same name used to hold off background jobs from starting at
+    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
+    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
+    // (background_task_maximum_delay).
+    let (init_done_tx, init_done_rx) = utils::completion::channel();
+
+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
+    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
+
+    let order = pageserver::InitializationOrder {
+        initial_tenant_load: Some(init_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: init_logical_size_done_tx,
+        background_jobs_can_start: background_jobs_barrier.clone(),
+    };
+
    // Scan the local 'tenants/' directory and start loading the tenants
+    let init_started_at = std::time::Instant::now();
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        broker_client.clone(),
        remote_storage.clone(),
+        order,
    ))?;

+    BACKGROUND_RUNTIME.spawn({
+        let init_done_rx = init_done_rx;
+        let shutdown_pageserver = shutdown_pageserver.clone();
+        let drive_init = async move {
+            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
+
+            init_done_rx.wait().await;
+            // initial logical sizes can now start, as they were waiting on init_done_rx.
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            let init_done = std::time::Instant::now();
+            let elapsed = init_done - init_started_at;
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed"
+            );
+
+            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+
+            let timeout = conf.background_task_maximum_delay;
+
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
+                    let now = std::time::Instant::now();
+                    tracing::info!(
+                        from_init_done_millis = (now - init_done).as_millis(),
+                        from_init_millis = (now - init_started_at).as_millis(),
+                        "Initial logical sizes completed"
+                    );
+                    None
+                }
+                _ = tokio::time::sleep(timeout) => {
+                    tracing::info!(
+                        timeout_millis = timeout.as_millis(),
+                        "Initial logical size timeout elapsed; starting background jobs"
+                    );
+                    Some(init_sizes_done)
+                }
+            };
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            // allow background jobs to start
+            drop(background_jobs_can_start);
+
+            if let Some(init_sizes_done) = init_sizes_done {
+                // ending up here is not a bug; at the latest logical sizes will be queried by
+                // consumption metrics.
+                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+                init_sizes_done.await;
+
+                scopeguard::ScopeGuard::into_inner(guard);
+
+                let now = std::time::Instant::now();
+                tracing::info!(
+                    from_init_done_millis = (now - init_done).as_millis(),
+                    from_init_millis = (now - init_started_at).as_millis(),
+                    "Initial logical sizes completed after timeout (background jobs already started)"
+                );
+
+            }
+        };
+
+        async move {
+            let mut drive_init = std::pin::pin!(drive_init);
+            // just race these tasks
+            tokio::select! {
+                _ = shutdown_pageserver.cancelled() => {},
+                _ = &mut drive_init => {},
+            }
+        }
+    });
+
    // shared state between the disk-usage backed eviction background task and the http endpoint
    // that allows triggering disk-usage based eviction manually. note that the http endpoint
    // is still accessible even if background task is not configured as long as remote storage has
@@ -352,6 +459,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            background_jobs_barrier.clone(),
        )?;
    }

@@ -389,6 +497,7 @@ fn start_pageserver(
        );

        if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
+            let background_jobs_barrier = background_jobs_barrier;
            let metrics_ctx = RequestContext::todo_child(
                TaskKind::MetricsCollection,
                // This task itself shouldn't download anything.
@@ -404,6 +513,18 @@ fn start_pageserver(
                "consumption metrics collection",
                true,
                async move {
+                    // first wait until background jobs are cleared to launch.
+                    //
+                    // this is because we only process active tenants and timelines, and the
+                    // Timeline::get_current_logical_size will spawn the logical size calculation,
+                    // which will not be rate-limited.
+                    let cancel = task_mgr::shutdown_token();
+
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };
+
                    pageserver::consumption_metrics::collect_metrics(
                        metric_collection_endpoint,
                        conf.metric_collection_interval,
@@ -452,6 +573,8 @@ fn start_pageserver(
        );
    }

+    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
+
    // All started up! Now just sit and wait for shutdown signal.
    ShutdownSignals::handle(|signal| match signal {
        Signal::Quit => {
@@ -467,6 +590,11 @@ fn start_pageserver(
                "Got {}. Terminating gracefully in fast shutdown mode",
                signal.name()
            );
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
            BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
            unreachable!()
        }
--- a/pageserver/src/bin/pageserver_binutils.rs
+++ b/pageserver/src/bin/pageserver_binutils.rs
@@ -1,174 +0,0 @@
-//! A helper tool to manage pageserver binary files.
-//! Accepts a file as an argument, attempts to parse it with all ways possible
-//! and prints its interpreted context.
-//!
-//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
-use std::{
-    path::{Path, PathBuf},
-    str::FromStr,
-};
-
-use anyhow::Context;
-use clap::{value_parser, Arg, Command};
-
-use pageserver::{
-    context::{DownloadBehavior, RequestContext},
-    page_cache,
-    task_mgr::TaskKind,
-    tenant::{dump_layerfile_from_path, metadata::TimelineMetadata},
-    virtual_file,
-};
-use postgres_ffi::ControlFileData;
-use utils::{lsn::Lsn, project_git_version};
-
-project_git_version!(GIT_VERSION);
-
-const METADATA_SUBCOMMAND: &str = "metadata";
-
-fn main() -> anyhow::Result<()> {
-    let arg_matches = cli().get_matches();
-
-    match arg_matches.subcommand() {
-        Some((subcommand_name, subcommand_matches)) => {
-            let path = subcommand_matches
-                .get_one::<PathBuf>("metadata_path")
-                .context("'metadata_path' argument is missing")?
-                .to_path_buf();
-            anyhow::ensure!(
-                subcommand_name == METADATA_SUBCOMMAND,
-                "Unknown subcommand {subcommand_name}"
-            );
-            handle_metadata(&path, subcommand_matches)?;
-        }
-        None => {
-            let path = arg_matches
-                .get_one::<PathBuf>("path")
-                .context("'path' argument is missing")?
-                .to_path_buf();
-            println!(
-                "No subcommand specified, attempting to guess the format for file {}",
-                path.display()
-            );
-            if let Err(e) = read_pg_control_file(&path) {
-                println!(
-                    "Failed to read input file as a pg control one: {e:#}\n\
-                    Attempting to read it as layer file"
-                );
-                print_layerfile(&path)?;
-            }
-        }
-    };
-    Ok(())
-}
-
-fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
-    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
-    println!("{control_file:?}");
-    let control_file_initdb = Lsn(control_file.checkPoint);
-    println!(
-        "pg_initdb_lsn: {}, aligned: {}",
-        control_file_initdb,
-        control_file_initdb.align()
-    );
-    Ok(())
-}
-
-fn print_layerfile(path: &Path) -> anyhow::Result<()> {
-    // Basic initialization of things that don't change after startup
-    virtual_file::init(10);
-    page_cache::init(100);
-    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
-    dump_layerfile_from_path(path, true, &ctx)
-}
-
-fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> {
-    let metadata_bytes = std::fs::read(path)?;
-    let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
-    println!("Current metadata:\n{meta:?}");
-    let mut update_meta = false;
-    if let Some(disk_consistent_lsn) = arg_matches.get_one::<String>("disk_consistent_lsn") {
-        meta = TimelineMetadata::new(
-            Lsn::from_str(disk_consistent_lsn)?,
-            meta.prev_record_lsn(),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-    if let Some(prev_record_lsn) = arg_matches.get_one::<String>("prev_record_lsn") {
-        meta = TimelineMetadata::new(
-            meta.disk_consistent_lsn(),
-            Some(Lsn::from_str(prev_record_lsn)?),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            meta.latest_gc_cutoff_lsn(),
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-    if let Some(latest_gc_cuttoff) = arg_matches.get_one::<String>("latest_gc_cuttoff") {
-        meta = TimelineMetadata::new(
-            meta.disk_consistent_lsn(),
-            meta.prev_record_lsn(),
-            meta.ancestor_timeline(),
-            meta.ancestor_lsn(),
-            Lsn::from_str(latest_gc_cuttoff)?,
-            meta.initdb_lsn(),
-            meta.pg_version(),
-        );
-        update_meta = true;
-    }
-
-    if update_meta {
-        let metadata_bytes = meta.to_bytes()?;
-        std::fs::write(path, metadata_bytes)?;
-    }
-
-    Ok(())
-}
-
-fn cli() -> Command {
-    Command::new("Neon Pageserver binutils")
-        .about("Reads pageserver (and related) binary files management utility")
-        .version(GIT_VERSION)
-        .arg(
-            Arg::new("path")
-                .help("Input file path")
-                .value_parser(value_parser!(PathBuf))
-                .required(false),
-        )
-        .subcommand(
-            Command::new(METADATA_SUBCOMMAND)
-                .about("Read and update pageserver metadata file")
-                .arg(
-                    Arg::new("metadata_path")
-                        .help("Input metadata file path")
-                        .value_parser(value_parser!(PathBuf))
-                        .required(false),
-                )
-                .arg(
-                    Arg::new("disk_consistent_lsn")
-                        .long("disk_consistent_lsn")
-                        .help("Replace disk consistent Lsn"),
-                )
-                .arg(
-                    Arg::new("prev_record_lsn")
-                        .long("prev_record_lsn")
-                        .help("Replace previous record Lsn"),
-                )
-                .arg(
-                    Arg::new("latest_gc_cuttoff")
-                        .long("latest_gc_cuttoff")
-                        .help("Replace latest gc cuttoff"),
-                ),
-        )
-}
-
-#[test]
-fn verify_cli() {
-    cli().debug_assert();
-}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -63,6 +63,7 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    ///
    /// Default built-in configuration file.
@@ -91,9 +92,10 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

-
 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}

+#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -108,7 +110,7 @@ pub mod defaults {

 #min_resident_size_override = .. # in bytes
 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
-
+#gc_feedback = false
 # [remote_storage]

 "###
@@ -187,6 +189,15 @@ pub struct PageServerConf {
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
+
+    /// How long will background tasks be delayed at most after initial load of tenants.
+    ///
+    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
+    /// as we now isolate initial loading, initial logical size calculation and background tasks.
+    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
+    /// has it's initial logical size calculated. Not running background tasks for some seconds is
+    /// not terrible.
+    pub background_task_maximum_delay: Duration,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -259,6 +270,8 @@ struct PageServerConfigBuilder {
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
+
+    background_task_maximum_delay: BuilderValue<Duration>,
 }

 impl Default for PageServerConfigBuilder {
@@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder {
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
+
+            background_task_maximum_delay: Set(humantime::parse_duration(
+                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
+            )
+            .unwrap()),
        }
    }
 }
@@ -440,6 +458,10 @@ impl PageServerConfigBuilder {
            BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
    }

+    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
+        self.background_task_maximum_delay = BuilderValue::Set(delay);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -522,6 +544,9 @@ impl PageServerConfigBuilder {
                .ok_or(anyhow!(
                    "missing ondemand_download_behavior_treat_error_as_warn"
                ))?,
+            background_task_maximum_delay: self
+                .background_task_maximum_delay
+                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
        })
    }
 }
@@ -710,6 +735,7 @@ impl PageServerConf {
                    )
                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
+                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -828,6 +854,14 @@ impl PageServerConf {
            )?);
        }

+        if let Some(gc_feedback) = item.get("gc_feedback") {
+            t_conf.gc_feedback = Some(
+                gc_feedback
+                    .as_bool()
+                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
+            );
+        }
+
        Ok(t_conf)
    }

@@ -869,6 +903,7 @@ impl PageServerConf {
            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
+            background_task_maximum_delay: Duration::ZERO,
        }
    }
 }
@@ -1028,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'

 log_format = 'json'
+background_task_maximum_delay = '334 s'

 "#;

@@ -1086,6 +1122,9 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: humantime::parse_duration(
+                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
+                )?,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1140,6 +1179,7 @@ log_format = 'json'
                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: Duration::from_secs(334),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -88,6 +88,7 @@
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
+#[derive(Clone, Debug)]
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
@@ -95,7 +96,7 @@ pub struct RequestContext {

 /// Desired behavior if the operation requires an on-demand download
 /// to proceed.
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
 pub enum DownloadBehavior {
    /// Download the layer file. It can take a while.
    Download,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::completion;
 use utils::serde_percent::Percent;

 use crate::{
@@ -82,6 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
        info!("disk usage based eviction task not configured");
@@ -98,14 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
        "disk usage based eviction",
        false,
        async move {
-            disk_usage_eviction_task(
-                &state,
-                task_config,
-                storage,
-                &conf.tenants_path(),
-                task_mgr::shutdown_token(),
-            )
-            .await;
+            let cancel = task_mgr::shutdown_token();
+
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            tokio::select! {
+                _ = cancel.cancelled() => { return Ok(()); },
+                _ = background_jobs_barrier.wait() => { }
+            };
+
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+                .await;
            info!("disk usage based eviction task finishing");
            Ok(())
        },
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -678,6 +678,8 @@ paths:
          application/json:
            schema:
              type: object
+              required:
+                - new_timeline_id
              properties:
                new_timeline_id:
                  type: string
@@ -936,6 +938,8 @@ components:
      allOf:
        - $ref: '#/components/schemas/TenantConfig'
        - type: object
+          required:
+            - new_tenant_id
          properties:
            new_tenant_id:
              type: string
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,3 +1,6 @@
+//!
+//! Management HTTP API
+//!
 use std::collections::HashMap;
 use std::sync::Arc;

@@ -11,7 +14,7 @@ use storage_broker::BrokerClientChannel;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::http::endpoint::RequestSpan;
+use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

@@ -25,7 +28,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
+use crate::tenant::mgr::{
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
@@ -44,7 +49,6 @@ use utils::{
 };

 // Imports only used for testing APIs
-#[cfg(feature = "testing")]
 use super::models::ConfigureFailpointsRequest;

 struct State {
@@ -144,6 +148,36 @@ impl From<TenantStateError> for ApiError {
    }
 }

+impl From<GetTenantError> for ApiError {
+    fn from(tse: GetTenantError) -> ApiError {
+        match tse {
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            e @ GetTenantError::NotActive(_) => {
+                // Why is this not `ApiError::NotFound`?
+                // Because we must be careful to never return 404 for a tenant if it does
+                // in fact exist locally. If we did, the caller could draw the conclusion
+                // that it can attach the tenant to another PS and we'd be in split-brain.
+                //
+                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
+impl From<SetNewTenantConfigError> for ApiError {
+    fn from(e: SetNewTenantConfigError) -> ApiError {
+        match e {
+            SetNewTenantConfigError::GetTenant(tid) => {
+                ApiError::NotFound(anyhow!("tenant {}", tid))
+            }
+            e @ SetNewTenantConfigError::Persist(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(e))
+            }
+        }
+    }
+}
+
 impl From<crate::tenant::DeleteTimelineError> for ApiError {
    fn from(value: crate::tenant::DeleteTimelineError) -> Self {
        use crate::tenant::DeleteTimelineError::*;
@@ -163,7 +197,7 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
        match value {
            // Report Precondition failed so client can distinguish between
            // "tenant is missing" case from "timeline is missing"
-            Tenant(TenantStateError::NotFound(..)) => {
+            Tenant(GetTenantError::NotFound(..)) => {
                ApiError::PreconditionFailed("Requested tenant is missing")
            }
            Tenant(t) => ApiError::from(t),
@@ -258,20 +292,24 @@ async fn build_timeline_info_common(
 }

 // healthcheck handler
-async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn status_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    let config = get_config(&request);
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
    check_permission(&request, Some(tenant_id))?;

-    let new_timeline_id = request_data
-        .new_timeline_id
-        .unwrap_or_else(TimelineId::generate);
+    let new_timeline_id = request_data.new_timeline_id;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

@@ -299,11 +337,14 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
    .await
 }

-async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
@@ -337,7 +378,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, response_data)
 }

-async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_detail_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
@@ -372,7 +416,10 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, timeline_info)
 }

-async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_lsn_by_timestamp_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -396,7 +443,10 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
    json_response(StatusCode::OK, result)
 }

-async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_attach_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -432,7 +482,10 @@ async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Bo
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -446,7 +499,10 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
    json_response(StatusCode::OK, ())
 }

-async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_detach_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
@@ -460,7 +516,10 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_load_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -480,7 +539,10 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_ignore_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -493,7 +555,10 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, ())
 }

-async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_list_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;

    let response_data = mgr::list_tenants()
@@ -513,7 +578,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
    json_response(StatusCode::OK, response_data)
 }

-async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_status(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -527,7 +595,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        }

        let state = tenant.current_state();
-        Ok(TenantInfo {
+        Result::<_, ApiError>::Ok(TenantInfo {
            id: tenant_id,
            state: state.clone(),
            current_physical_size: Some(current_physical_size),
@@ -535,8 +603,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
        })
    }
    .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
-    .await
-    .map_err(ApiError::InternalServerError)?;
+    .await?;

    json_response(StatusCode::OK, tenant_info)
 }
@@ -554,7 +621,10 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// Note: we don't update the cached size and prometheus metric here.
 /// The retention period might be different, and it's nice to have a method to just calculate it
 /// without modifying anything anyway.
-async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
@@ -619,7 +689,10 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
    )
 }

-async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_map_info_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let reset: LayerAccessStatsReset =
@@ -633,7 +706,10 @@ async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>
    json_response(StatusCode::OK, layer_map_info)
 }

-async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn layer_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -656,7 +732,10 @@ async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>
    }
 }

-async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn evict_timeline_layer_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -734,7 +813,12 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
    Ok(response)
 }

-async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn tenant_create_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let target_tenant_id = request_data.new_tenant_id;
    check_permission(&request, None)?;

    let _timer = STORAGE_TIME_GLOBAL
@@ -742,17 +826,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        .expect("bug")
        .start_timer();

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
-    let request_data: TenantCreateRequest = json_request(&mut request).await?;
-
    let tenant_conf =
        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    let target_tenant_id = request_data
-        .new_tenant_id
-        .map(TenantId::from)
-        .unwrap_or_else(TenantId::generate);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

    let state = get_state(&request);

@@ -786,7 +863,10 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    )
 }

-async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn get_tenant_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

@@ -812,6 +892,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo

 async fn update_tenant_config_handler(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let request_data: TenantConfigRequest = json_request(&mut request).await?;
    let tenant_id = request_data.tenant_id;
@@ -829,8 +910,10 @@ async fn update_tenant_config_handler(
 }

 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
-#[cfg(feature = "testing")]
-async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn handle_tenant_break(
+    r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
@@ -842,8 +925,10 @@ async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiErro
    json_response(StatusCode::OK, ())
 }

-#[cfg(feature = "testing")]
-async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
        return Err(ApiError::BadRequest(anyhow!(
            "Cannot manage failpoints because pageserver was compiled without failpoints support"
@@ -876,7 +961,10 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }

 // Run GC immediately on given timeline.
-async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_gc_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -895,8 +983,10 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
 }

 // Run compaction immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -917,8 +1007,10 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
 }

 // Run checkpoint immediately on given timeline.
-#[cfg(feature = "testing")]
-async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn timeline_checkpoint_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -942,6 +1034,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<

 async fn timeline_download_remote_layers_handler_post(
    mut request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -957,6 +1050,7 @@ async fn timeline_download_remote_layers_handler_post(

 async fn timeline_download_remote_layers_handler_get(
    request: Request<Body>,
+    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;
@@ -981,7 +1075,10 @@ async fn active_timeline_of_active_tenant(
        .map_err(ApiError::NotFound)
 }

-async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn always_panic_handler(
+    req: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
    // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
    // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
@@ -992,7 +1089,10 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
    json_response(StatusCode::NO_CONTENT, ())
 }

-async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn disk_usage_eviction_run(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    check_permission(&r, None)?;

    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
@@ -1082,8 +1182,10 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    )
 }

-#[cfg(feature = "testing")]
-async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn post_tracing_event_handler(
+    mut r: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
    #[derive(Debug, serde::Deserialize)]
    #[serde(rename_all = "lowercase")]
    enum Level {
@@ -1113,6 +1215,85 @@ async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Bod
    json_response(StatusCode::OK, ())
 }

+/// Common functionality of all the HTTP API handlers.
+///
+/// - Adds a tracing span to each request (by `request_span`)
+/// - Logs the request depending on the request method (by `request_span`)
+/// - Logs the response if it was not successful (by `request_span`
+/// - Shields the handler function from async cancellations. Hyper can drop the handler
+///   Future if the connection to the client is lost, but most of the pageserver code is
+///   not async cancellation safe. This converts the dropped future into a graceful cancellation
+///   request with a CancellationToken.
+async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    // Spawn a new task to handle the request, to protect the handler from unexpected
+    // async cancellations. Most pageserver functions are not async cancellation safe.
+    // We arm a drop-guard, so that if Hyper drops the Future, we signal the task
+    // with the cancellation token.
+    let token = CancellationToken::new();
+    let cancel_guard = token.clone().drop_guard();
+    let result = request_span(request, move |r| async {
+        let handle = tokio::spawn(
+            async {
+                let token_cloned = token.clone();
+                let result = handler(r, token).await;
+                if token_cloned.is_cancelled() {
+                    info!("Cancelled request finished");
+                }
+                result
+            }
+            .in_current_span(),
+        );
+
+        match handle.await {
+            Ok(result) => result,
+            Err(e) => {
+                // The handler task panicked. We have a global panic handler that logs the
+                // panic with its backtrace, so no need to log that here. Only log a brief
+                // message to make it clear that we returned the error to the client.
+                error!("HTTP request handler task panicked: {e:#}");
+
+                // Don't return an Error here, because then fallback error handler that was
+                // installed in make_router() will print the error. Instead, construct the
+                // HTTP error response and return that.
+                Ok(
+                    ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
+                        .into_response(),
+                )
+            }
+        }
+    })
+    .await;
+
+    cancel_guard.disarm();
+
+    result
+}
+
+/// Like api_handler, but returns an error response if the server is built without
+/// the 'testing' feature.
+async fn testing_api_handler<R, H>(
+    desc: &str,
+    request: Request<Body>,
+    handler: H,
+) -> Result<Response<Body>, ApiError>
+where
+    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
+    H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
+{
+    if cfg!(feature = "testing") {
+        api_handler(request, handler).await
+    } else {
+        std::future::ready(Err(ApiError::BadRequest(anyhow!(
+            "Cannot {desc} because pageserver was compiled without testing APIs",
+        ))))
+        .await
+    }
+}
+
 pub fn make_router(
    conf: &'static PageServerConf,
    launch_ts: &'static LaunchTimestamp,
@@ -1142,26 +1323,6 @@ pub fn make_router(
        .expect("construct launch timestamp header middleware"),
    );

-    macro_rules! testing_api {
-        ($handler_desc:literal, $handler:path $(,)?) => {{
-            #[cfg(not(feature = "testing"))]
-            async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
-                Err(ApiError::BadRequest(anyhow!(concat!(
-                    "Cannot ",
-                    $handler_desc,
-                    " because pageserver was compiled without testing APIs",
-                ))))
-            }
-
-            #[cfg(feature = "testing")]
-            let handler = $handler;
-            #[cfg(not(feature = "testing"))]
-            let handler = cfg_disabled;
-
-            move |r| RequestSpan(handler).handle(r)
-        }};
-    }
-
    Ok(router
        .data(Arc::new(
            State::new(
@@ -1173,96 +1334,88 @@ pub fn make_router(
            )
            .context("Failed to initialize router state")?,
        ))
-        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
-        .put(
-            "/v1/failpoints",
-            testing_api!("manage failpoints", failpoints_handler),
-        )
-        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
-        .post("/v1/tenant", |r| {
-            RequestSpan(tenant_create_handler).handle(r)
-        })
-        .get("/v1/tenant/:tenant_id", |r| {
-            RequestSpan(tenant_status).handle(r)
+        .get("/v1/status", |r| api_handler(r, status_handler))
+        .put("/v1/failpoints", |r| {
+            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
+        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
+        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
-            RequestSpan(tenant_size_handler).handle(r)
+            api_handler(r, tenant_size_handler)
        })
        .put("/v1/tenant/config", |r| {
-            RequestSpan(update_tenant_config_handler).handle(r)
+            api_handler(r, update_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            RequestSpan(get_tenant_config_handler).handle(r)
+            api_handler(r, get_tenant_config_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_list_handler).handle(r)
+            api_handler(r, timeline_list_handler)
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            RequestSpan(timeline_create_handler).handle(r)
+            api_handler(r, timeline_create_handler)
        })
        .post("/v1/tenant/:tenant_id/attach", |r| {
-            RequestSpan(tenant_attach_handler).handle(r)
+            api_handler(r, tenant_attach_handler)
        })
        .post("/v1/tenant/:tenant_id/detach", |r| {
-            RequestSpan(tenant_detach_handler).handle(r)
+            api_handler(r, tenant_detach_handler)
        })
        .post("/v1/tenant/:tenant_id/load", |r| {
-            RequestSpan(tenant_load_handler).handle(r)
+            api_handler(r, tenant_load_handler)
        })
        .post("/v1/tenant/:tenant_id/ignore", |r| {
-            RequestSpan(tenant_ignore_handler).handle(r)
+            api_handler(r, tenant_ignore_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_detail_handler).handle(r)
+            api_handler(r, timeline_detail_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
+            |r| api_handler(r, get_lsn_by_timestamp_handler),
        )
        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
-            RequestSpan(timeline_gc_handler).handle(r)
+            api_handler(r, timeline_gc_handler)
+        })
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
+            testing_api_handler("run timeline compaction", r, timeline_compact_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
-            testing_api!("run timeline compaction", timeline_compact_handler),
-        )
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
-            testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
+            |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
        )
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_post),
        )
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
+            |r| api_handler(r, timeline_download_remote_layers_handler_get),
        )
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            RequestSpan(timeline_delete_handler).handle(r)
+            api_handler(r, timeline_delete_handler)
        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
-            RequestSpan(layer_map_info_handler).handle(r)
+            api_handler(r, layer_map_info_handler)
        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(layer_download_handler).handle(r),
+            |r| api_handler(r, layer_download_handler),
        )
        .delete(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
+            |r| api_handler(r, evict_timeline_layer_handler),
        )
        .put("/v1/disk_usage_eviction/run", |r| {
-            RequestSpan(disk_usage_eviction_run).handle(r)
+            api_handler(r, disk_usage_eviction_run)
+        })
+        .put("/v1/tenant/:tenant_id/break", |r| {
+            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
+        })
+        .get("/v1/panic", |r| api_handler(r, always_panic_handler))
+        .post("/v1/tracing/event", |r| {
+            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
-        .put(
-            "/v1/tenant/:tenant_id/break",
-            testing_api!("set tenant state to broken", handle_tenant_break),
-        )
-        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
-        .post(
-            "/v1/tracing/event",
-            testing_api!("emit a tracing event", post_tracing_event_handler),
-        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -35,7 +35,7 @@ use tracing::info;
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 14;
+pub const DEFAULT_PG_VERSION: u32 = 15;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
@@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;

+#[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
@@ -57,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    // the checkpoint and GC tasks.
    tenant::mgr::shutdown_all_tenants().await;

-    // Stop syncing with remote storage.
-    //
-    // FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
-    // Should it?
-    task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
-
    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -137,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
    }
 }

+/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
+/// blocking.
+///
+/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
+/// delaying is needed.
+#[derive(Clone)]
+pub struct InitializationOrder {
+    /// Each initial tenant load task carries this until completion.
+    pub initial_tenant_load: Option<utils::completion::Completion>,
+
+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: utils::completion::Completion,
+
+    /// Barrier for when we can start any background jobs.
+    ///
+    /// This can be broken up later on, but right now there is just one class of a background job.
+    pub background_jobs_can_start: utils::completion::Barrier,
+}
+
 #[cfg(test)]
 mod backoff_defaults_tests {
    use super::*;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_read_num_fs_layers",
+        "Number of persistent layers accessed for processing a read request, including those in the cache",
+        &["tenant_id", "timeline_id"],
+        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics collected on operations on the storage repository.
 static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
@@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_materialized_cache_hits_direct_total",
+        "Number of cache hits from materialized page cache without redo",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_getpage_get_reconstruct_data_seconds",
+        "Time spent in get_reconstruct_value_data",
+        &["tenant_id", "timeline_id"],
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_materialized_cache_hits_total",
@@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
    0.001000, // 1000 usec
    0.030,    // 30 ms
    1.000,    // 1000 ms
+    30.000,   // 30000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the WAL redo process",
+        "Time spent waiting for access to the Postgres WAL redo process",
        redo_histogram_time_buckets!(),
    )
    .expect("failed to define a metric")
@@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of records replayed per redo in the Postgres WAL redo process",
        redo_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
 pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_bytes_histogram",
-        "Histogram of number of records replayed per redo",
+        "Histogram of number of records replayed per redo sent to Postgres",
        redo_bytes_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
@@ -723,7 +753,9 @@ pub struct TimelineMetrics {
    tenant_id: String,
    timeline_id: String,
    pub reconstruct_time_histo: Histogram,
+    pub get_reconstruct_data_time_histo: Histogram,
    pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
+    pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
    pub flush_time_histo: StorageTimeMetrics,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
@@ -734,6 +766,7 @@ pub struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
    pub resident_physical_size_gauge: UIntGauge,
+    pub read_num_fs_layers: Histogram,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -753,6 +786,9 @@ impl TimelineMetrics {
        let reconstruct_time_histo = RECONSTRUCT_TIME
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
@@ -794,6 +830,12 @@ impl TimelineMetrics {
        let evictions = EVICTIONS
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let read_num_fs_layers = READ_NUM_FS_LAYERS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
        let evictions_with_low_residence_duration =
            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

@@ -801,7 +843,9 @@ impl TimelineMetrics {
            tenant_id,
            timeline_id,
            reconstruct_time_histo,
+            get_reconstruct_data_time_histo,
            materialized_page_cache_hit_counter,
+            materialized_page_cache_hit_upon_request_counter,
            flush_time_histo,
            compact_time_histo,
            create_images_time_histo,
@@ -819,6 +863,7 @@ impl TimelineMetrics {
            evictions_with_low_residence_duration: std::sync::RwLock::new(
                evictions_with_low_residence_duration,
            ),
+            read_num_fs_layers,
        }
    }
 }
@@ -828,7 +873,9 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -836,6 +883,8 @@ impl Drop for TimelineMetrics {
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
+
        self.evictions_with_low_residence_duration
            .write()
            .unwrap()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar;
 use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::mgr;
+use crate::tenant::mgr::GetTenantError;
 use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

@@ -1150,7 +1152,9 @@ enum GetActiveTenantError {
        wait_time: Duration,
    },
    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
 }

 impl From<GetActiveTenantError> for QueryError {
@@ -1159,7 +1163,8 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::Other(e) => QueryError::Other(e),
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
        }
    }
 }
@@ -1175,13 +1180,16 @@ async fn get_active_tenant_with_timeout(
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
    let tenant = match mgr::get_tenant(tenant_id, false).await {
        Ok(tenant) => tenant,
-        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active=false")
+        }
    };
    let wait_time = Duration::from_secs(30);
    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
        Ok(Ok(())) => Ok(tenant),
        // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)),
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
        Err(_) => {
            let latest_state = tenant.current_state();
            if latest_state == TenantState::Active {
@@ -1196,13 +1204,35 @@ async fn get_active_tenant_with_timeout(
    }
 }

+#[derive(Debug, thiserror::Error)]
+enum GetActiveTimelineError {
+    #[error(transparent)]
+    Tenant(GetActiveTenantError),
+    #[error(transparent)]
+    Timeline(anyhow::Error),
+}
+
+impl From<GetActiveTimelineError> for QueryError {
+    fn from(e: GetActiveTimelineError) -> Self {
+        match e {
+            GetActiveTimelineError::Tenant(e) => e.into(),
+            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+        }
+    }
+}
+
 /// Shorthand for getting a reference to a Timeline of an Active tenant.
 async fn get_active_tenant_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTenantError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?;
-    let timeline = tenant.get_timeline(timeline_id, true).await?;
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .await
+        .map_err(GetActiveTimelineError::Timeline)?;
    Ok(timeline)
 }
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -476,18 +476,35 @@ pub async fn shutdown_tasks(
                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
-                victim_tasks.push(Arc::clone(task));
+                victim_tasks.push((
+                    Arc::clone(task),
+                    task.kind,
+                    task_mut.tenant_id,
+                    task_mut.timeline_id,
+                ));
            }
        }
    }

-    for task in victim_tasks {
+    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+
+    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
        let join_handle = {
            let mut task_mut = task.mutable.lock().unwrap();
            task_mut.join_handle.take()
        };
        if let Some(mut join_handle) = join_handle {
+            if log_all {
+                if tenant_id.is_none() {
+                    // there are quite few of these
+                    info!(name = task.name, kind = ?task_kind, "stopping global task");
+                } else {
+                    // warn to catch these in tests; there shouldn't be any
+                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                }
+            }
            let completed = tokio::select! {
+                biased;
                _ = &mut join_handle => { true },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -99,6 +99,7 @@ pub struct TenantConf {
    // See the corresponding metric's help string.
    #[serde(with = "humantime_serde")]
    pub evictions_low_residence_duration_metric_threshold: Duration,
+    pub gc_feedback: bool,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -175,6 +176,10 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub gc_feedback: Option<bool>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -242,6 +247,7 @@ impl TenantConfOpt {
            evictions_low_residence_duration_metric_threshold: self
                .evictions_low_residence_duration_metric_threshold
                .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
+            gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
        }
    }
 }
@@ -278,6 +284,7 @@ impl Default for TenantConf {
                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
            )
            .expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
+            gc_feedback: false,
        }
    }
 }
@@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
                    ))?,
            );
        }
+        tenant_conf.gc_feedback = request_data.gc_feedback;

        Ok(tenant_conf)
    }
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -204,6 +204,35 @@ fn test_off_by_one() {
    assert_eq!(version.image_coverage.query(5), None);
 }

+/// White-box regression test, checking for incorrect removal of node at key.end
+#[test]
+fn test_regression() {
+    let mut map = HistoricLayerCoverage::<String>::new();
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 0..5,
+            is_image: false,
+        },
+        "Layer 1".to_string(),
+    );
+    map.insert(
+        LayerKey {
+            key: 0..5,
+            lsn: 1..2,
+            is_image: false,
+        },
+        "Layer 2".to_string(),
+    );
+
+    // If an insertion operation improperly deletes the endpoint of a previous layer
+    // (which is more likely to happen with layers that collide on key.end), we will
+    // end up with an infinite layer, covering the entire keyspace. Here we assert
+    // that there's no layer at key 100 because we didn't insert any layer there.
+    let version = map.get_version(100).unwrap();
+    assert_eq!(version.delta_coverage.query(100), None);
+}
+
 /// Cover edge cases where layers begin or end on the same key
 #[test]
 fn test_key_collision() {
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -1,8 +1,8 @@
 use std::ops::Range;

-// TODO the `im` crate has 20x more downloads and also has
-// persistent/immutable BTree. It also runs a bit faster but
-// results are not the same on some tests.
+// NOTE the `im` crate has 20x more downloads and also has
+// persistent/immutable BTree. But it's bugged so rpds is a
+// better choice https://github.com/neondatabase/neon/issues/3395
 use rpds::RedBlackTreeMapSync;

 /// Data structure that can efficiently:
@@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync;
 /// - iterate the latest layers in a key range
 /// - insert layers in non-decreasing lsn.start order
 ///
-/// The struct is parameterized over Value for easier
-/// testing, but in practice it's some sort of layer.
+/// For a detailed explanation and justification of this approach, see:
+/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing
+///
+/// NOTE The struct is parameterized over Value for easier
+///      testing, but in practice it's some sort of layer.
 pub struct LayerCoverage<Value> {
    /// For every change in coverage (as we sweep the key space)
    /// we store (lsn.end, value).
    ///
-    /// We use an immutable/persistent tree so that we can keep historic
-    /// versions of this coverage without cloning the whole thing and
-    /// incurring quadratic memory cost. See HistoricLayerCoverage.
+    /// NOTE We use an immutable/persistent tree so that we can keep historic
+    ///      versions of this coverage without cloning the whole thing and
+    ///      incurring quadratic memory cost. See HistoricLayerCoverage.
    ///
-    /// We use the Sync version of the map because we want Self to
-    /// be Sync. Using nonsync might be faster, if we can work with
-    /// that.
+    /// NOTE We use the Sync version of the map because we want Self to
+    ///      be Sync. Using nonsync might be faster, if we can work with
+    ///      that.
    nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
 }

@@ -41,6 +44,13 @@ impl<Value: Clone> LayerCoverage<Value> {

    /// Helper function to subdivide the key range without changing any values
    ///
+    /// This operation has no semantic effect by itself. It only helps us pin in
+    /// place the part of the coverage we don't want to change when inserting.
+    ///
+    /// As an analogy, think of a polygon. If you add a vertex along one of the
+    /// segments, the polygon is still the same, but it behaves differently when
+    /// we move or delete one of the other points.
+    ///
    /// Complexity: O(log N)
    fn add_node(&mut self, key: i128) {
        let value = match self.nodes.range(..=key).last() {
@@ -74,7 +84,7 @@ impl<Value: Clone> LayerCoverage<Value> {
        let mut to_update = Vec::new();
        let mut to_remove = Vec::new();
        let mut prev_covered = false;
-        for (k, node) in self.nodes.range(key.clone()) {
+        for (k, node) in self.nodes.range(key) {
            let needs_cover = match node {
                None => true,
                Some((h, _)) => h < &lsn.end,
@@ -87,9 +97,8 @@ impl<Value: Clone> LayerCoverage<Value> {
            }
            prev_covered = needs_cover;
        }
-        if !prev_covered {
-            to_remove.push(key.end);
-        }
+        // TODO check if the nodes inserted at key.start and key.end are safe
+        //      to remove. It's fine to keep them but they could be redundant.
        for k in to_update {
            self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
        }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -10,6 +10,7 @@ use tokio::fs;
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use tokio::sync::RwLock;
+use tokio::task::JoinSet;
 use tracing::*;

 use remote_storage::GenericRemoteStorage;
@@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::IGNORED_TENANT_FILE_NAME;
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
@@ -63,6 +64,7 @@ pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
    let tenants_dir = conf.tenants_path();
@@ -119,6 +121,7 @@ pub async fn init_tenant_mgr(
                        &tenant_dir_path,
                        broker_client.clone(),
                        remote_storage.clone(),
+                        Some(init_order.clone()),
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -154,6 +157,7 @@ pub fn schedule_local_tenant_processing(
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
+    init_order: Option<InitializationOrder>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -207,7 +211,14 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx)
+        Tenant::spawn_load(
+            conf,
+            tenant_id,
+            broker_client,
+            remote_storage,
+            init_order,
+            ctx,
+        )
    };
    Ok(tenant)
 }
@@ -222,6 +233,7 @@ pub fn schedule_local_tenant_processing(
 /// That could be easily misinterpreted by control plane, the consumer of the
 /// management API. For example, it could attach the tenant on a different pageserver.
 /// We would then be in split-brain once this pageserver restarts.
+#[instrument]
 pub async fn shutdown_all_tenants() {
    // Prevent new tenants from being created.
    let tenants_to_shut_down = {
@@ -238,39 +250,51 @@ pub async fn shutdown_all_tenants() {
                tenants_clone
            }
            TenantsMap::ShuttingDown(_) => {
+                // TODO: it is possible that detach and shutdown happen at the same time. as a
+                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

-    let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
-    for (_, tenant) in tenants_to_shut_down {
-        if tenant.is_active() {
-            // updates tenant state, forbidding new GC and compaction iterations from starting
-            tenant.set_stopping().await;
-            tenants_to_freeze_and_flush.push(tenant);
+    let mut join_set = JoinSet::new();
+    for (tenant_id, tenant) in tenants_to_shut_down {
+        join_set.spawn(
+            async move {
+                let freeze_and_flush = true;
+
+                match tenant.shutdown(freeze_and_flush).await {
+                    Ok(()) => debug!("tenant successfully stopped"),
+                    Err(super::ShutdownError::AlreadyStopping) => {
+                        warn!("tenant was already shutting down")
+                    }
+                }
+            }
+            .instrument(info_span!("shutdown", %tenant_id)),
+        );
+    }
+
+    let mut panicked = 0;
+
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
+            }
        }
    }

-    // Shut down all existing walreceiver connections and stop accepting the new ones.
-    task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
-
-    // Ok, no background tasks running anymore. Flush any remaining data in
-    // memory to disk.
-    //
-    // We assume that any incoming connections that might request pages from
-    // the tenant have already been terminated by the caller, so there
-    // should be no more activity in any of the repositories.
-    //
-    // On error, log it but continue with the shutdown for other tenants.
-    for tenant in tenants_to_freeze_and_flush {
-        let tenant_id = tenant.tenant_id();
-        debug!("shutdown tenant {tenant_id}");
-
-        if let Err(err) = tenant.freeze_and_flush().await {
-            error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
-        }
+    if panicked > 0 {
+        warn!(panicked, "observed panicks while shutting down tenants");
    }
 }

@@ -291,7 +315,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -304,11 +328,19 @@ pub async fn create_tenant(
    }).await
 }

+#[derive(Debug, thiserror::Error)]
+pub enum SetNewTenantConfigError {
+    #[error(transparent)]
+    GetTenant(#[from] GetTenantError),
+    #[error(transparent)]
+    Persist(anyhow::Error),
+}
+
 pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-) -> Result<(), TenantStateError> {
+) -> Result<(), SetNewTenantConfigError> {
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

@@ -318,23 +350,32 @@ pub async fn set_new_tenant_config(
        &tenant_config_path,
        new_tenant_conf,
        false,
-    )?;
+    )
+    .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf).await;
    Ok(())
 }

+#[derive(Debug, thiserror::Error)]
+pub enum GetTenantError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+}
+
 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
 pub async fn get_tenant(
    tenant_id: TenantId,
    active_only: bool,
-) -> Result<Arc<Tenant>, TenantStateError> {
+) -> Result<Arc<Tenant>, GetTenantError> {
    let m = TENANTS.read().await;
    let tenant = m
        .get(&tenant_id)
-        .ok_or(TenantStateError::NotFound(tenant_id))?;
+        .ok_or(GetTenantError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        Err(TenantStateError::NotActive(tenant_id))
+        Err(GetTenantError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
    }
@@ -343,7 +384,7 @@ pub async fn get_tenant(
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
-    Tenant(#[from] TenantStateError),
+    Tenant(#[from] GetTenantError),

    #[error("Timeline {0}")]
    Timeline(#[from] crate::tenant::DeleteTimelineError),
@@ -420,7 +461,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -493,7 +534,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -569,26 +610,26 @@ where
    // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
    // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
    // avoid holding the lock for the entire process.
-    {
-        let tenants_accessor = TENANTS.write().await;
-        match tenants_accessor.get(&tenant_id) {
-            Some(tenant) => match tenant.current_state() {
-                TenantState::Attaching
-                | TenantState::Loading
-                | TenantState::Activating
-                | TenantState::Broken { .. }
-                | TenantState::Active => tenant.set_stopping().await,
-                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
-            },
-            None => return Err(TenantStateError::NotFound(tenant_id)),
+    let tenant = {
+        TENANTS
+            .write()
+            .await
+            .get(&tenant_id)
+            .cloned()
+            .ok_or(TenantStateError::NotFound(tenant_id))?
+    };
+
+    let freeze_and_flush = false;
+
+    // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
+    // that we can continue safely to cleanup.
+    match tenant.shutdown(freeze_and_flush).await {
+        Ok(()) => {}
+        Err(super::ShutdownError::AlreadyStopping) => {
+            return Err(TenantStateError::IsStopping(tenant_id))
        }
    }

-    // shutdown all tenant and timeline tasks: gc, compaction, page service)
-    // No new tasks will be started for this tenant because it's in `Stopping` state.
-    // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
-    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
-
    match tenant_cleanup
        .await
        .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -670,7 +711,6 @@ pub async fn immediate_gc(
    Ok(wait_task_done)
 }

-#[cfg(feature = "testing")]
 pub async fn immediate_compact(
    tenant_id: TenantId,
    timeline_id: TimelineId,
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
    Ok(())
 }

-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
-    const PARALLEL_PATH_THRESHOLD: usize = 1;
-    if paths.len() <= PARALLEL_PATH_THRESHOLD {
-        for path in paths {
-            fsync_path(path)?;
-        }
-        return Ok(());
-    }
+fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.

    /// Use at most this number of threads.
    /// Increasing this limit will
@@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
    let num_threads = paths.len().min(MAX_NUM_THREADS);
    let next_path_idx = AtomicUsize::new(0);

-    crossbeam_utils::thread::scope(|s| -> io::Result<()> {
+    std::thread::scope(|s| -> io::Result<()> {
        let mut handles = vec![];
        // Spawn `num_threads - 1`, as the current thread is also a worker.
        for _ in 1..num_threads {
-            handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx)));
+            handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx)));
        }

        parallel_worker(paths, &next_path_idx)?;
@@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {

        Ok(())
    })
-    .unwrap()
+}
+
+/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
+pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+    if paths.len() == 1 {
+        fsync_path(&paths[0])?;
+        return Ok(());
+    }
+
+    fsync_in_thread_pool(paths)
+}
+
+/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
+/// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
+pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
+    const MAX_CONCURRENT_FSYNC: usize = 64;
+    let mut next = paths.iter().peekable();
+    let mut js = tokio::task::JoinSet::new();
+    loop {
+        while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() {
+            let next = next.next().expect("just peeked");
+            let next = next.to_owned();
+            js.spawn_blocking(move || fsync_path(&next));
+        }
+
+        // now the joinset has been filled up, wait for next to complete
+        if let Some(res) = js.join_next().await {
+            res??;
+        } else {
+            // last item had already completed
+            assert!(
+                next.peek().is_none(),
+                "joinset emptied, we shouldn't have more work"
+            );
+            return Ok(());
+        }
+    }
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,6 +4,7 @@ pub mod delta_layer;
 mod filename;
 mod image_layer;
 mod inmemory_layer;
+mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
@@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
+pub use layer_desc::PersistentLayerDesc;
 pub use remote_layer::RemoteLayer;

 use super::layer_map::BatchedUpdates;
@@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer {
-    fn get_tenant_id(&self) -> TenantId;
+    /// Get the layer descriptor.
+    fn layer_desc(&self) -> &PersistentLayerDesc;
+
+    fn get_tenant_id(&self) -> TenantId {
+        self.layer_desc().tenant_id
+    }

    /// Identify the timeline this layer belongs to
-    fn get_timeline_id(&self) -> TimelineId;
+    fn get_timeline_id(&self) -> TimelineId {
+        self.layer_desc().timeline_id
+    }

    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
-    fn filename(&self) -> LayerFileName;
+    fn filename(&self) -> LayerFileName {
+        self.layer_desc().filename()
+    }

    // Path to the layer file in the local filesystem.
    // `None` for `RemoteLayer`.
@@ -542,7 +553,7 @@ impl From<LayerFileName> for LayerDescriptor {
 ///
 /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the
 /// global config, and paths to layer files are constructed using the tenant/timeline
-/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer
+/// path from the config. But in the 'pagectl' binary, we need to construct a Layer
 /// struct for a file on disk, without having a page server running, so that we have no
 /// config. In that case, we use the Path variant to hold the full path to the file on
 /// disk.
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -56,8 +56,8 @@ use utils::{
 };

 use super::{
-    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
-    LayerKeyIter, PathOrConf,
+    DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    PathOrConf, PersistentLayerDesc,
 };

 ///
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
-            lsn_range: layer.lsn_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn_range: layer.desc.lsn_range.clone(),

            index_start_blk: 0,
            index_root_blk: 0,
@@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1;
 /// reading/deserializing records themselves.
 ///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
-struct BlobRef(u64);
+pub struct BlobRef(pub u64);

 impl BlobRef {
    pub fn will_init(&self) -> bool {
@@ -180,10 +180,7 @@ impl DeltaKey {
 pub struct DeltaLayer {
    path_or_conf: PathOrConf,

-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub lsn_range: Range<Lsn>,
+    pub desc: PersistentLayerDesc,

    pub file_size: u64,

@@ -197,8 +194,8 @@ impl std::fmt::Debug for DeltaLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("DeltaLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
-            .field("lsn_range", &self.lsn_range)
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
+            .field("lsn_range", &self.desc.lsn_range)
            .field("file_size", &self.file_size)
            .field("inner", &self.inner)
            .finish()
@@ -228,30 +225,16 @@ impl std::fmt::Debug for DeltaLayerInner {
 }

 impl Layer for DeltaLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-    fn is_incremental(&self) -> bool {
-        true
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        if !verbose {
@@ -324,10 +307,10 @@ impl Layer for DeltaLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ensure!(lsn_range.start >= self.lsn_range.start);
+        ensure!(lsn_range.start >= self.desc.lsn_range.start);
        let mut need_image = true;

-        ensure!(self.key_range.contains(&key));
+        ensure!(self.desc.key_range.contains(&key));

        {
            // Open the file and lock the metadata in memory
@@ -402,19 +385,31 @@ impl Layer for DeltaLayer {
            Ok(ValueReconstructResult::Complete)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for DeltaLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
-
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -602,10 +597,12 @@ impl DeltaLayer {
    ) -> DeltaLayer {
        DeltaLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
-            lsn_range: filename.lsn_range.clone(),
+            desc: PersistentLayerDesc::new_delta(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn_range.clone(),
+            ),
            file_size,
            access_stats,
            inner: RwLock::new(DeltaLayerInner {
@@ -619,7 +616,7 @@ impl DeltaLayer {

    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
@@ -632,10 +629,12 @@ impl DeltaLayer {

        Ok(DeltaLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
-            lsn_range: summary.lsn_range,
+            desc: PersistentLayerDesc::new_delta(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn_range,
+            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
@@ -648,18 +647,14 @@ impl DeltaLayer {
    }

    fn layer_name(&self) -> DeltaFileName {
-        DeltaFileName {
-            key_range: self.key_range.clone(),
-            lsn_range: self.lsn_range.clone(),
-        }
+        self.desc.delta_file_name()
    }
-
    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -803,10 +798,12 @@ impl DeltaLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.
        let layer = DeltaLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            tenant_id: self.tenant_id,
-            timeline_id: self.timeline_id,
-            key_range: self.key_start..key_end,
-            lsn_range: self.lsn_range.clone(),
+            desc: PersistentLayerDesc::new_delta(
+                self.tenant_id,
+                self.timeline_id,
+                self.key_start..key_end,
+                self.lsn_range.clone(),
+            ),
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: RwLock::new(DeltaLayerInner {
--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -9,6 +9,8 @@ use std::str::FromStr;

 use utils::lsn::Lsn;

+use super::PersistentLayerDesc;
+
 // Note: Timeline::load_layer_map() relies on this sort order
 #[derive(PartialEq, Eq, Clone, Hash)]
 pub struct DeltaFileName {
@@ -153,7 +155,7 @@ impl Ord for ImageFileName {
 impl ImageFileName {
    pub fn lsn_as_range(&self) -> Range<Lsn> {
        // Saves from having to copypaste this all over
-        self.lsn..(self.lsn + 1)
+        PersistentLayerDesc::image_layer_lsn_range(self.lsn)
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -52,8 +52,8 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{ImageFileName, LayerFileName};
-use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
+use super::filename::ImageFileName;
+use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: layer.tenant_id,
-            timeline_id: layer.timeline_id,
-            key_range: layer.key_range.clone(),
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
            lsn: layer.lsn,

            index_start_blk: 0,
@@ -104,14 +104,13 @@ impl From<&ImageLayer> for Summary {
 /// and it needs to be loaded before using it in queries.
 pub struct ImageLayer {
    path_or_conf: PathOrConf,
-    pub tenant_id: TenantId,
-    pub timeline_id: TimelineId,
-    pub key_range: Range<Key>,
-    pub file_size: u64,

-    // This entry contains an image of all pages as of this LSN
+    pub desc: PersistentLayerDesc,
+    // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
    pub lsn: Lsn,

+    pub file_size: u64,
+
    access_stats: LayerAccessStats,

    inner: RwLock<ImageLayerInner>,
@@ -122,7 +121,7 @@ impl std::fmt::Debug for ImageLayer {
        use super::RangeDisplayDebug;

        f.debug_struct("ImageLayer")
-            .field("key_range", &RangeDisplayDebug(&self.key_range))
+            .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
            .field("file_size", &self.file_size)
            .field("lsn", &self.lsn)
            .field("inner", &self.inner)
@@ -153,27 +152,15 @@ impl std::fmt::Debug for ImageLayerInner {
 }

 impl Layer for ImageLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        // End-bound is exclusive
-        self.lsn..(self.lsn + 1)
-    }
-    fn is_incremental(&self) -> bool {
-        false
-    }
-
-    fn short_id(&self) -> String {
-        self.filename().file_name()
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
            "----- image layer for ten {} tli {} key {}-{} at {} ----",
-            self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.lsn
        );

        if !verbose {
@@ -203,7 +190,7 @@ impl Layer for ImageLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        assert!(self.key_range.contains(&key));
+        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

@@ -230,24 +217,37 @@ impl Layer for ImageLayer {
            Ok(ValueReconstructResult::Missing)
        }
    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
+    }
 }

 impl PersistentLayer for ImageLayer {
-    fn filename(&self) -> LayerFileName {
-        self.layer_name().into()
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
        Some(self.path())
    }

-    fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timeline_id
-    }
    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
        unimplemented!();
    }
@@ -405,9 +405,13 @@ impl ImageLayer {
    ) -> ImageLayer {
        ImageLayer {
            path_or_conf: PathOrConf::Conf(conf),
-            timeline_id,
-            tenant_id,
-            key_range: filename.key_range.clone(),
+            desc: PersistentLayerDesc::new_img(
+                tenant_id,
+                timeline_id,
+                filename.key_range.clone(),
+                filename.lsn,
+                false,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: filename.lsn,
            file_size,
            access_stats,
@@ -422,7 +426,7 @@ impl ImageLayer {

    /// Create an ImageLayer struct representing an existing file on disk.
    ///
-    /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary.
+    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
        let mut summary_buf = Vec::new();
        summary_buf.resize(PAGE_SZ, 0);
@@ -433,9 +437,13 @@ impl ImageLayer {
            .context("get file metadata to determine size")?;
        Ok(ImageLayer {
            path_or_conf: PathOrConf::Path(path.to_path_buf()),
-            timeline_id: summary.timeline_id,
-            tenant_id: summary.tenant_id,
-            key_range: summary.key_range,
+            desc: PersistentLayerDesc::new_img(
+                summary.tenant_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn,
+                false,
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -449,18 +457,15 @@ impl ImageLayer {
    }

    fn layer_name(&self) -> ImageFileName {
-        ImageFileName {
-            key_range: self.key_range.clone(),
-            lsn: self.lsn,
-        }
+        self.desc.image_file_name()
    }

    /// Path to the layer file in pageserver workdir.
    pub fn path(&self) -> PathBuf {
        Self::path_for(
            &self.path_or_conf,
-            self.timeline_id,
-            self.tenant_id,
+            self.desc.timeline_id,
+            self.desc.tenant_id,
            &self.layer_name(),
        )
    }
@@ -484,6 +489,7 @@ struct ImageLayerWriterInner {
    tenant_id: TenantId,
    key_range: Range<Key>,
    lsn: Lsn,
+    is_incremental: bool,

    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -499,6 +505,7 @@ impl ImageLayerWriterInner {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<Self> {
        // Create the file initially with a temporary filename.
        // We'll atomically rename it to the final name when we're done.
@@ -533,6 +540,7 @@ impl ImageLayerWriterInner {
            lsn,
            tree: tree_builder,
            blob_writer,
+            is_incremental,
        };

        Ok(writer)
@@ -570,6 +578,14 @@ impl ImageLayerWriterInner {
            file.write_all(buf.as_ref())?;
        }

+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            self.is_incremental, // for now, image layer ALWAYS covers the full range
+        );
+
        // Fill in the summary on blk 0
        let summary = Summary {
            magic: IMAGE_FILE_MAGIC,
@@ -593,9 +609,7 @@ impl ImageLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.
        let layer = ImageLayer {
            path_or_conf: PathOrConf::Conf(self.conf),
-            timeline_id: self.timeline_id,
-            tenant_id: self.tenant_id,
-            key_range: self.key_range.clone(),
+            desc,
            lsn: self.lsn,
            file_size: metadata.len(),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -667,6 +681,7 @@ impl ImageLayerWriter {
        tenant_id: TenantId,
        key_range: &Range<Key>,
        lsn: Lsn,
+        is_incremental: bool,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(ImageLayerWriterInner::new(
@@ -675,6 +690,7 @@ impl ImageLayerWriter {
                tenant_id,
                key_range,
                lsn,
+                is_incremental,
            )?),
        })
    }
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -0,0 +1,109 @@
+use std::ops::Range;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+use crate::repository::Key;
+
+use super::{DeltaFileName, ImageFileName, LayerFileName};
+
+/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
+/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
+/// a unified way to generate layer information like file name.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct PersistentLayerDesc {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    /// For image layer, this is `[lsn, lsn+1)`.
+    pub lsn_range: Range<Lsn>,
+    /// Whether this is a delta layer.
+    pub is_delta: bool,
+    /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
+    /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
+    /// incremental.
+    pub is_incremental: bool,
+}
+
+impl PersistentLayerDesc {
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
+    }
+
+    pub fn new_img(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+        is_incremental: bool,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range: Self::image_layer_lsn_range(lsn),
+            is_delta: false,
+            is_incremental,
+        }
+    }
+
+    pub fn new_delta(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    ) -> Self {
+        Self {
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn_range,
+            is_delta: true,
+            is_incremental: true,
+        }
+    }
+
+    /// Get the LSN that the image layer covers.
+    pub fn image_layer_lsn(&self) -> Lsn {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        self.lsn_range.start
+    }
+
+    /// Get the LSN range corresponding to a single image layer LSN.
+    pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
+        lsn..(lsn + 1)
+    }
+
+    /// Get a delta file name for this layer.
+    ///
+    /// Panic: if this is not a delta layer.
+    pub fn delta_file_name(&self) -> DeltaFileName {
+        assert!(self.is_delta);
+        DeltaFileName {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+        }
+    }
+
+    /// Get a delta file name for this layer.
+    ///
+    /// Panic: if this is not an image layer, or the lsn range is invalid
+    pub fn image_file_name(&self) -> ImageFileName {
+        assert!(!self.is_delta);
+        assert!(self.lsn_range.start + 1 == self.lsn_range.end);
+        ImageFileName {
+            key_range: self.key_range.clone(),
+            lsn: self.lsn_range.start,
+        }
+    }
+
+    pub fn filename(&self) -> LayerFileName {
+        if self.is_delta {
+            self.delta_file_name().into()
+        } else {
+            self.image_file_name().into()
+        }
+    }
+}
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -18,11 +18,10 @@ use utils::{
    lsn::Lsn,
 };

-use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
-use super::image_layer::ImageLayer;
+use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
-    LayerResidenceStatus, PersistentLayer,
+    DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -34,19 +33,10 @@ use super::{
 ///
 /// See: [`crate::context::RequestContext`] for authorization to download
 pub struct RemoteLayer {
-    tenantid: TenantId,
-    timelineid: TimelineId,
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
-
-    pub file_name: LayerFileName,
+    pub desc: PersistentLayerDesc,

    pub layer_metadata: LayerFileMetadata,

-    is_delta: bool,
-
-    is_incremental: bool,
-
    access_stats: LayerAccessStats,

    pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -66,22 +56,14 @@ pub struct RemoteLayer {
 impl std::fmt::Debug for RemoteLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("RemoteLayer")
-            .field("file_name", &self.file_name)
+            .field("file_name", &self.desc.filename())
            .field("layer_metadata", &self.layer_metadata)
-            .field("is_incremental", &self.is_incremental)
+            .field("is_incremental", &self.desc.is_incremental)
            .finish()
    }
 }

 impl Layer for RemoteLayer {
-    fn get_key_range(&self) -> Range<Key> {
-        self.key_range.clone()
-    }
-
-    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.lsn_range.clone()
-    }
-
    fn get_value_reconstruct_data(
        &self,
        _key: Key,
@@ -95,53 +77,45 @@ impl Layer for RemoteLayer {
        );
    }

-    fn is_incremental(&self) -> bool {
-        self.is_incremental
-    }
-
    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
-            self.tenantid,
-            self.timelineid,
-            self.key_range.start,
-            self.key_range.end,
-            self.lsn_range.start,
-            self.lsn_range.end
+            self.desc.tenant_id,
+            self.desc.timeline_id,
+            self.desc.key_range.start,
+            self.desc.key_range.end,
+            self.desc.lsn_range.start,
+            self.desc.lsn_range.end
        );

        Ok(())
    }

+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_key_range(&self) -> Range<Key> {
+        self.layer_desc().key_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn get_lsn_range(&self) -> Range<Lsn> {
+        self.layer_desc().lsn_range.clone()
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn is_incremental(&self) -> bool {
+        self.layer_desc().is_incremental
+    }
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
    fn short_id(&self) -> String {
-        self.filename().file_name()
+        self.layer_desc().short_id()
    }
 }

 impl PersistentLayer for RemoteLayer {
-    fn get_tenant_id(&self) -> TenantId {
-        self.tenantid
-    }
-
-    fn get_timeline_id(&self) -> TimelineId {
-        self.timelineid
-    }
-
-    fn filename(&self) -> LayerFileName {
-        if self.is_delta {
-            DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            }
-            .into()
-        } else {
-            ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            }
-            .into()
-        }
+    fn layer_desc(&self) -> &PersistentLayerDesc {
+        &self.desc
    }

    fn local_path(&self) -> Option<PathBuf> {
@@ -176,7 +150,7 @@ impl PersistentLayer for RemoteLayer {
        let layer_file_name = self.filename().file_name();
        let lsn_range = self.get_lsn_range();

-        if self.is_delta {
+        if self.desc.is_delta {
            HistoricLayerInfo::Delta {
                layer_file_name,
                layer_file_size: self.layer_metadata.file_size(),
@@ -210,13 +184,13 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_as_range(),
-            is_delta: false,
-            is_incremental: false,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_img(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn,
+                false,
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -232,13 +206,12 @@ impl RemoteLayer {
        access_stats: LayerAccessStats,
    ) -> RemoteLayer {
        RemoteLayer {
-            tenantid,
-            timelineid,
-            key_range: fname.key_range.clone(),
-            lsn_range: fname.lsn_range.clone(),
-            is_delta: true,
-            is_incremental: true,
-            file_name: fname.to_owned().into(),
+            desc: PersistentLayerDesc::new_delta(
+                tenantid,
+                timelineid,
+                fname.key_range.clone(),
+                fname.lsn_range.clone(),
+            ),
            layer_metadata: layer_metadata.clone(),
            ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -256,15 +229,12 @@ impl RemoteLayer {
    where
        L: ?Sized + Layer,
    {
-        if self.is_delta {
-            let fname = DeltaFileName {
-                key_range: self.key_range.clone(),
-                lsn_range: self.lsn_range.clone(),
-            };
+        if self.desc.is_delta {
+            let fname = self.desc.delta_file_name();
            Arc::new(DeltaLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
@@ -273,14 +243,11 @@ impl RemoteLayer {
                ),
            ))
        } else {
-            let fname = ImageFileName {
-                key_range: self.key_range.clone(),
-                lsn: self.lsn_range.start,
-            };
+            let fname = self.desc.image_file_name();
            Arc::new(ImageLayer::new(
                conf,
-                self.timelineid,
-                self.tenantid,
+                self.desc.timeline_id,
+                self.desc.tenant_id,
                &fname,
                file_size,
                self.access_stats.clone_for_residence_change(
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,13 +9,17 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::id::TenantId;
+use utils::completion;

-pub fn start_background_loops(tenant_id: TenantId) {
+/// Start per tenant background loops: compaction and gc.
+pub fn start_background_loops(
+    tenant: &Arc<Tenant>,
+    background_jobs_can_start: Option<&completion::Barrier>,
+) {
+    let tenant_id = tenant.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
@@ -23,11 +27,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("compactor for tenant {tenant_id}"),
        false,
-        async move {
-            compaction_loop(tenant_id)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let background_jobs_can_start = background_jobs_can_start.cloned();
+            async move {
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                compaction_loop(tenant, cancel)
+                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
    task_mgr::spawn(
@@ -37,11 +50,20 @@ pub fn start_background_loops(tenant_id: TenantId) {
        None,
        &format!("garbage collector for tenant {tenant_id}"),
        false,
-        async move {
-            gc_loop(tenant_id)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
-                .await;
-            Ok(())
+        {
+            let tenant = Arc::clone(tenant);
+            let background_jobs_can_start = background_jobs_can_start.cloned();
+            async move {
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                gc_loop(tenant, cancel)
+                    .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                    .await;
+                Ok(())
+            }
        },
    );
 }
@@ -49,27 +71,26 @@ pub fn start_background_loops(tenant_id: TenantId) {
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant_id: TenantId) {
+async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
-        let cancel = task_mgr::shutdown_token();
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_compaction_period();

@@ -119,29 +140,29 @@ async fn compaction_loop(tenant_id: TenantId) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant_id: TenantId) {
+async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
-        let cancel = task_mgr::shutdown_token();
        // GC might require downloading, to find the cutoff LSN that corresponds to the
        // cutoff specified as time.
-        let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let ctx =
+            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
            trace!("waking up");

-            let tenant = tokio::select! {
+            tokio::select! {
                _ = cancel.cancelled() => {
                    info!("received cancellation request");
                    return;
                },
-                tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
+                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
                    ControlFlow::Break(()) => return,
-                    ControlFlow::Continue(tenant) => tenant,
+                    ControlFlow::Continue(()) => (),
                },
-            };
+            }

            let period = tenant.get_gc_period();

@@ -161,7 +182,9 @@ async fn gc_loop(tenant_id: TenantId) {
                Duration::from_secs(10)
            } else {
                // Run gc
-                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                let res = tenant
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .await;
                if let Err(e) = res {
                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
                    wait_duration
@@ -187,23 +210,10 @@ async fn gc_loop(tenant_id: TenantId) {
    trace!("GC loop stopped.");
 }

-async fn wait_for_active_tenant(
-    tenant_id: TenantId,
-    wait: Duration,
-) -> ControlFlow<(), Arc<Tenant>> {
-    let tenant = loop {
-        match mgr::get_tenant(tenant_id, false).await {
-            Ok(tenant) => break tenant,
-            Err(e) => {
-                error!("Failed to get a tenant {tenant_id}: {e:#}");
-                tokio::time::sleep(wait).await;
-            }
-        }
-    };
-
+async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
    // if the tenant has a proper status already, no need to wait for anything
    if tenant.current_state() == TenantState::Active {
-        ControlFlow::Continue(tenant)
+        ControlFlow::Continue(())
    } else {
        let mut tenant_state_updates = tenant.subscribe_for_state_updates();
        loop {
@@ -213,7 +223,7 @@ async fn wait_for_active_tenant(
                    match new_state {
                        TenantState::Active => {
                            debug!("Tenant state changed to active, continuing the task loop");
-                            return ControlFlow::Continue(tenant);
+                            return ControlFlow::Continue(());
                        }
                        state => {
                            debug!("Not running the task loop, tenant is not active: {state:?}");
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -57,6 +57,7 @@ use pageserver_api::reltag::RelTag;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::to_pg_timestamp;
 use utils::{
+    completion,
    id::{TenantId, TimelineId},
    lsn::{AtomicLsn, Lsn, RecordLsn},
    seqwait::SeqWait,
@@ -119,7 +120,7 @@ pub struct Timeline {

    pub pg_version: u32,

-    pub(crate) layers: tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>,
+    pub(crate) layers: Arc<tokio::sync::RwLock<LayerMap<dyn PersistentLayer>>>,

    /// Set of key ranges which should be covered by image layers to
    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
@@ -195,8 +196,9 @@ pub struct Timeline {
    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
-    /// and [`Tenant::delete_timeline`].
-    pub(super) layer_removal_cs: tokio::sync::Mutex<()>,
+    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
+    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
+    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,

    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,
@@ -235,7 +237,18 @@ pub struct Timeline {

    state: watch::Sender<TimelineState>,

+    /// Prevent two tasks from deleting the timeline at the same time. If held, the
+    /// timeline is being deleted. If 'true', the timeline has already been deleted.
+    pub delete_lock: tokio::sync::Mutex<bool>,
+
    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
+
+    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
+    initial_logical_size_can_start: Option<completion::Barrier>,
+
+    /// Completion shared between all timelines loaded during startup; used to delay heavier
+    /// background tasks until some logical sizes have been calculated.
+    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }

 type LayerMapWriteLockGuard<'t> = tokio::sync::RwLockWriteGuard<'t, LayerMap<dyn PersistentLayer>>;
@@ -522,7 +535,12 @@ impl Timeline {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
-                    Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
+                    Ordering::Equal => {
+                        self.metrics
+                            .materialized_page_cache_hit_upon_request_counter
+                            .inc();
+                        return Ok(cached_img); // exact LSN match, return the image
+                    }
                    Ordering::Greater => {
                        unreachable!("the returned lsn should never be after the requested lsn")
                    }
@@ -537,8 +555,10 @@ impl Timeline {
            img: cached_page_img,
        };

+        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
+        timer.stop_and_record();

        self.metrics
            .reconstruct_time_histo
@@ -624,7 +644,7 @@ impl Timeline {
        {
            Ok(()) => Ok(()),
            Err(e) => {
-                // walreceiver.status() locks internally, don't count that towards the wait_lsn_time_histo
+                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
                drop(_timer);
                let walreceiver_status = {
                    match &*self.walreceiver.lock().unwrap() {
@@ -671,7 +691,7 @@ impl Timeline {
    }

    /// Outermost timeline compaction operation; downloads needed layers.
-    pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+    pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
        const ROUNDS: usize = 2;

        let last_record_lsn = self.get_last_record_lsn();
@@ -760,7 +780,7 @@ impl Timeline {
    }

    /// Compaction which might need to be retried after downloading remote layers.
-    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
+    async fn compact_inner(self: &Arc<Self>, ctx: &RequestContext) -> Result<(), CompactionError> {
        //
        // High level strategy for compaction / image creation:
        //
@@ -795,7 +815,7 @@ impl Timeline {
        // Below are functions compact_level0() and create_image_layers()
        // but they are a bit ad hoc and don't quite work like it's explained
        // above. Rewrite it.
-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        let state = *self.state.borrow();
        if state == TimelineState::Stopping {
@@ -829,7 +849,7 @@ impl Timeline {

                // 3. Compact
                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(&layer_removal_cs, target_file_size, ctx)
+                self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx)
                    .await?;
                timer.stop_and_record();
            }
@@ -918,10 +938,15 @@ impl Timeline {
        Ok(())
    }

-    pub fn activate(self: &Arc<Self>, broker_client: BrokerClientChannel, ctx: &RequestContext) {
+    pub fn activate(
+        self: &Arc<Self>,
+        broker_client: BrokerClientChannel,
+        background_jobs_can_start: Option<&completion::Barrier>,
+        ctx: &RequestContext,
+    ) {
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
-        self.launch_eviction_task();
+        self.launch_eviction_task(background_jobs_can_start);
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -939,6 +964,14 @@ impl Timeline {
                error!("Not activating a Stopping timeline");
            }
            (_, new_state) => {
+                if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
+                    // drop the copmletion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                self.state.send_replace(new_state);
            }
        }
@@ -1288,6 +1321,13 @@ impl Timeline {
            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
    }

+    fn get_gc_feedback(&self) -> bool {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .gc_feedback
+            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
+    }
+
    pub(super) fn tenant_conf_updated(&self) {
        // NB: Most tenant conf options are read by background loops, so,
        // changes will automatically be picked up.
@@ -1322,6 +1362,8 @@ impl Timeline {
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        remote_client: Option<RemoteTimelineClient>,
        pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(TimelineState::Loading);
@@ -1346,7 +1388,7 @@ impl Timeline {
                timeline_id,
                tenant_id,
                pg_version,
-                layers: tokio::sync::RwLock::new(LayerMap::default()),
+                layers: Arc::new(tokio::sync::RwLock::new(LayerMap::default())),
                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
@@ -1415,6 +1457,10 @@ impl Timeline {
                eviction_task_timeline_state: tokio::sync::Mutex::new(
                    EvictionTaskTimelineState::default(),
                ),
+                delete_lock: tokio::sync::Mutex::new(false),
+
+                initial_logical_size_can_start,
+                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
            };
            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
@@ -1903,9 +1949,30 @@ impl Timeline {
            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
-                // no cancellation here, because nothing really waits for this to complete compared
+
+                let cancel = task_mgr::shutdown_token();
+
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
+                };
+
+                // hold off background tasks from starting until all timelines get to try at least
+                // once initial logical size calculation; though retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+                //
+                // dropping this at every outcome is probably better than trying to cling on to it,
+                // delay will be terminated by a timeout regardless.
+                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+
+                // no extra cancellation here, because nothing really waits for this to complete compared
                // to spawn_ondemand_logical_size_calculation.
                let cancel = CancellationToken::new();
+
                let calculated_size = match self_clone
                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
@@ -2170,7 +2237,7 @@ impl Timeline {
    fn delete_historic_layer(
        &self,
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        layer: Arc<dyn PersistentLayer>,
        updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
    ) -> anyhow::Result<()> {
@@ -2249,6 +2316,9 @@ impl Timeline {
        let mut timeline_owned;
        let mut timeline = self;

+        let mut read_count =
+            scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
+
        // For debugging purposes, collect the path of layers that we traversed
        // through. It's included in the error message if we fail to find the key.
        let mut traversal_path = Vec::<TraversalPathItem>::new();
@@ -2383,6 +2453,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            // metrics: open_layer does not count as fs access, so we are not updating `read_count`
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2409,6 +2480,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            // metrics: open_layer does not count as fs access, so we are not updating `read_count`
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2443,6 +2515,7 @@ impl Timeline {
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
+                            *read_count += 1;
                            traversal_path.push((
                                result,
                                cont_lsn,
@@ -2508,7 +2581,7 @@ impl Timeline {
                    (DownloadBehavior::Error, false) => {
                        return Err(PageReconstructError::NeedsDownload(
                            TenantTimelineId::new(self.tenant_id, self.timeline_id),
-                            remote_layer.file_name.clone(),
+                            remote_layer.filename(),
                        ))
                    }
                }
@@ -2654,7 +2727,7 @@ impl Timeline {

    /// Layer flusher task's main loop.
    async fn flush_loop(
-        &self,
+        self: &Arc<Self>,
        mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>,
        ctx: &RequestContext,
    ) {
@@ -2743,9 +2816,9 @@ impl Timeline {
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
    async fn flush_frozen_layer(
-        &self,
+        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -2869,26 +2942,41 @@ impl Timeline {

    // Write out the given frozen in-memory layer as a new L0 delta file
    async fn create_delta_layer(
-        &self,
+        self: &Arc<Self>,
        frozen_layer: &InMemoryLayer,
    ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
-        // Write it out
-        let new_delta = frozen_layer.write_to_disk()?;
-        let new_delta_path = new_delta.path();
-        let new_delta_filename = new_delta.filename();
+        // TODO figure out how to use spawn_blocking. Can't use it because frozen_layer is not 'static
+        let (new_delta, sz): (DeltaLayer, _) = tokio::task::block_in_place({
+            let self_clone = Arc::clone(self);
+            move || {
+                // Write it out
+                let new_delta = frozen_layer.write_to_disk()?;
+                let new_delta_path = new_delta.path();

-        // Sync it to disk.
-        //
-        // We must also fsync the timeline dir to ensure the directory entries for
-        // new layer files are durable
-        //
-        // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
-        // files to flush, it might be better to first write them all, and then fsync
-        // them all in parallel.
-        par_fsync::par_fsync(&[
-            new_delta_path.clone(),
-            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-        ])?;
+                // Sync it to disk.
+                //
+                // We must also fsync the timeline dir to ensure the directory entries for
+                // new layer files are durable
+                //
+                // TODO: If we're running inside 'flush_frozen_layers' and there are multiple
+                // files to flush, it might be better to first write them all, and then fsync
+                // them all in parallel.
+
+                // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace
+                // this with a single fsync in future refactors.
+                par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?;
+                // Then sync the parent directory.
+                par_fsync::par_fsync(&[self_clone
+                    .conf
+                    .timeline_path(&self_clone.timeline_id, &self_clone.tenant_id)])
+                .context("fsync of timeline dir")?;
+
+                let sz = new_delta_path.metadata()?.len();
+
+                anyhow::Ok((new_delta, sz))
+            }
+        })?;
+        let new_delta_name = new_delta.filename();

        // Add it to the layer map
        let l = Arc::new(new_delta);
@@ -2903,14 +2991,12 @@ impl Timeline {
        batch_updates.flush();

        // update the timeline's physical size
-        let sz = new_delta_path.metadata()?.len();
-
        self.metrics.resident_physical_size_gauge.add(sz);
        // update metrics
        self.metrics.num_persistent_files_created.inc_by(1);
        self.metrics.persistent_bytes_written.inc_by(sz);

-        Ok((new_delta_filename, LayerFileMetadata::new(sz)))
+        Ok((new_delta_name, LayerFileMetadata::new(sz)))
    }

    async fn repartition(
@@ -3053,6 +3139,7 @@ impl Timeline {
                    self.tenant_id,
                    &img_range,
                    lsn,
+                    false, // image layer always covers the full range
                )?;

                fail_point!("image-layer-writer-fail-before-finish", |_| {
@@ -3116,17 +3203,22 @@ impl Timeline {
        let all_paths = image_layers
            .iter()
            .map(|layer| layer.path())
-            .chain(std::iter::once(
-                self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
-            ))
            .collect::<Vec<_>>();
-        par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&all_paths)
+            .await
+            .context("fsync of newly created layer files")?;
+
+        par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+            .await
+            .context("fsync of timeline dir")?;

        let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());

        let mut layers = self.layers.write().await;
        let mut updates = layers.batch_update();
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+
        for l in image_layers {
            let path = l.filename();
            let metadata = timeline_path
@@ -3185,13 +3277,13 @@ impl Timeline {
    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
    /// start of level0 files compaction, the on-demand download should be revisited as well.
-    async fn compact_level0_phase1(
+    fn compact_level0_phase1(
        &self,
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layers: tokio::sync::OwnedRwLockReadGuard<LayerMap<dyn PersistentLayer>>,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
-        let layers = self.layers.read().await;
        let mut level0_deltas = layers.get_level0_deltas()?;

        // Only compact if enough layers have accumulated.
@@ -3498,13 +3590,13 @@ impl Timeline {
        if !new_layers.is_empty() {
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

-            // also sync the directory
-            layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
-
            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;

+            par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)])
+                .context("fsync of timeline dir")?;
+
            layer_paths.pop().unwrap();
        }

@@ -3521,17 +3613,27 @@ impl Timeline {
    /// as Level 1 files.
    ///
    async fn compact_level0(
-        &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        self: &Arc<Self>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
+        let this = self.clone();
+        let ctx_inner = ctx.clone();
+        let layer_removal_cs_inner = layer_removal_cs.clone();
+        let layers = Arc::clone(&self.layers).read_owned().await;
+        let span = tracing::info_span!("blocking");
        let CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact,
-        } = self
-            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
-            .await?;
+        } = tokio::task::spawn_blocking(move || {
+            let _g = span.entered();
+            this.compact_level0_phase1(layer_removal_cs_inner, layers, target_file_size, &ctx_inner)
+        })
+        .await
+        .context("compact_level0_phase1 spawn_blocking")
+        .map_err(CompactionError::Other)
+        .and_then(|res| res)?;

        if new_layers.is_empty() && deltas_to_compact.is_empty() {
            // nothing to do
@@ -3589,7 +3691,7 @@ impl Timeline {
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
        for l in deltas_to_compact {
            layer_names_to_delete.push(l.filename());
-            self.delete_historic_layer(layer_removal_cs, l, &mut updates)?;
+            self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
        }
        updates.flush();
        drop(layers);
@@ -3709,10 +3811,11 @@ impl Timeline {

        fail_point!("before-timeline-gc");

-        let layer_removal_cs = self.layer_removal_cs.lock().await;
+        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        let state = *self.state.borrow();
        if state == TimelineState::Stopping {
+            // there's a global allowed_error for this
            anyhow::bail!("timeline is Stopping");
        }

@@ -3729,7 +3832,7 @@ impl Timeline {

        let res = self
            .gc_timeline(
-                &layer_removal_cs,
+                layer_removal_cs.clone(),
                horizon_cutoff,
                pitr_cutoff,
                retain_lsns,
@@ -3748,7 +3851,7 @@ impl Timeline {

    async fn gc_timeline(
        &self,
-        layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
        horizon_cutoff: Lsn,
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
@@ -3888,7 +3991,7 @@ impl Timeline {
                // delta layers. Image layers can form "stairs" preventing old image from been deleted.
                // But image layers are in any case less sparse than delta layers. Also we need some
                // protection from replacing recent image layers with new one after each GC iteration.
-                if l.is_incremental() && !LayerMap::is_l0(&*l) {
+                if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) {
                    wanted_image_layers.add_range(l.get_key_range());
                }
                result.layers_not_updated += 1;
@@ -3921,7 +4024,11 @@ impl Timeline {
            {
                for doomed_layer in layers_to_remove {
                    layer_names_to_delete.push(doomed_layer.filename());
-                    self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning?
+                    self.delete_historic_layer(
+                        layer_removal_cs.clone(),
+                        doomed_layer,
+                        &mut updates,
+                    )?; // FIXME: schedule succeeded deletions before returning?
                    result.layers_removed += 1;
                }
            }
@@ -4093,7 +4200,7 @@ impl Timeline {
                // Does retries + exponential back-off internally.
                // When this fails, don't layer further retry attempts here.
                let result = remote_client
-                    .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
+                    .download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata)
                    .await;

                if let Ok(size) = &result {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -34,6 +34,8 @@ use crate::{
    },
 };

+use utils::completion;
+
 use super::Timeline;

 #[derive(Default)]
@@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState {
 }

 impl Timeline {
-    pub(super) fn launch_eviction_task(self: &Arc<Self>) {
+    pub(super) fn launch_eviction_task(
+        self: &Arc<Self>,
+        background_tasks_can_start: Option<&completion::Barrier>,
+    ) {
        let self_clone = Arc::clone(self);
+        let background_tasks_can_start = background_tasks_can_start.cloned();
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
@@ -57,7 +63,13 @@ impl Timeline {
            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
            false,
            async move {
-                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
+                };
+
+                self_clone.eviction_task(cancel).await;
                info!("eviction task finishing");
                Ok(())
            },
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -25,6 +25,7 @@ mod walreceiver_connection;

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::timeline::walreceiver::connection_manager::{
    connection_manager_loop_step, ConnectionManagerState,
 };
@@ -85,7 +86,8 @@ impl WalReceiver {
            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
            false,
            async move {
-                info!("WAL receiver manager started, connecting to broker");
+                debug_assert_current_span_has_tenant_and_timeline_id();
+                debug!("WAL receiver manager started, connecting to broker");
                let mut connection_manager_state = ConnectionManagerState::new(
                    timeline,
                    conf,
@@ -93,7 +95,7 @@ impl WalReceiver {
                loop {
                    select! {
                        _ = task_mgr::shutdown_watcher() => {
-                            info!("WAL receiver shutdown requested, shutting down");
+                            trace!("WAL receiver shutdown requested, shutting down");
                            break;
                        },
                        loop_step_result = connection_manager_loop_step(
@@ -104,7 +106,7 @@ impl WalReceiver {
                        ) => match loop_step_result {
                            ControlFlow::Continue(()) => continue,
                            ControlFlow::Break(()) => {
-                                info!("Connection manager loop ended, shutting down");
+                                trace!("Connection manager loop ended, shutting down");
                                break;
                            }
                        },
@@ -115,7 +117,7 @@ impl WalReceiver {
                *loop_status.write().unwrap() = None;
                Ok(())
            }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
        );

        Self {
@@ -198,29 +200,19 @@ impl<E: Clone> TaskHandle<E> {
                TaskEvent::End(match self.join_handle.as_mut() {
                    Some(jh) => {
                        if !jh.is_finished() {
-                            // Barring any implementation errors in this module, we can
-                            // only arrive here while the task that executes the future
-                            // passed to `Self::spawn()` is still execution. Cf the comment
-                            // in Self::spawn().
-                            //
-                            // This was logging at warning level in earlier versions, presumably
-                            // to leave some breadcrumbs in case we had an implementation
-                            // error that would would make us get stuck in `jh.await`.
-                            //
-                            // There hasn't been such a bug so far.
-                            // But in a busy system, e.g., during pageserver restart,
-                            // we arrive here often enough that the warning-level logs
-                            // became a distraction.
-                            // So, tone them down to info-level.
-                            //
-                            // XXX: rewrite this module to eliminate the race condition.
-                            info!("sender is dropped while join handle is still alive");
+                            // See: https://github.com/neondatabase/neon/issues/2885
+                            trace!("sender is dropped while join handle is still alive");
                        }

-                        let res = jh
-                            .await
-                            .map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
-                            .and_then(|x| x);
+                        let res = match jh.await {
+                            Ok(res) => res,
+                            Err(je) if je.is_cancelled() => unreachable!("not used"),
+                            Err(je) if je.is_panic() => {
+                                // already logged
+                                Ok(())
+                            }
+                            Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
+                        };

                        // For cancellation-safety, drop join_handle only after successful .await.
                        self.join_handle = None;
@@ -243,12 +235,12 @@ impl<E: Clone> TaskHandle<E> {
            match jh.await {
                Ok(Ok(())) => debug!("Shutdown success"),
                Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
-                Err(join_error) => {
-                    if join_error.is_cancelled() {
-                        error!("Shutdown task was cancelled");
-                    } else {
-                        error!("Shutdown task join error: {join_error}")
-                    }
+                Err(je) if je.is_cancelled() => unreachable!("not used"),
+                Err(je) if je.is_panic() => {
+                    // already logged
+                }
+                Err(je) => {
+                    error!("Shutdown task join error: {je}")
                }
            }
        }
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -18,7 +18,7 @@ use crate::metrics::{
    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::Timeline;
+use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
 use anyhow::Context;
 use chrono::{NaiveDateTime, Utc};
 use pageserver_api::models::TimelineState;
@@ -55,8 +55,11 @@ pub(super) async fn connection_manager_loop_step(
        .await
    {
        Ok(()) => {}
-        Err(_) => {
-            info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
+        Err(new_state) => {
+            debug!(
+                ?new_state,
+                "state changed, stopping wal connection manager loop"
+            );
            return ControlFlow::Break(());
        }
    }
@@ -79,7 +82,7 @@ pub(super) async fn connection_manager_loop_step(
    // with other streams on this client (other connection managers). When
    // object goes out of scope, stream finishes in drop() automatically.
    let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
-    info!("Subscribed for broker timeline updates");
+    debug!("Subscribed for broker timeline updates");

    loop {
        let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -151,12 +154,12 @@ pub(super) async fn connection_manager_loop_step(
                                // we're already active as walreceiver, no need to reactivate
                                TimelineState::Active => continue,
                                TimelineState::Broken | TimelineState::Stopping => {
-                                    info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
+                                    debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                    return ControlFlow::Break(());
                                }
                                TimelineState::Loading => {
                                    warn!("timeline transitioned back to Loading state, that should not happen");
-                                    return ControlFlow::Continue(new_state);
+                                    return ControlFlow::Continue(());
                                }
                            }
                        }
@@ -164,12 +167,11 @@ pub(super) async fn connection_manager_loop_step(
                    }
                }
            } => match new_event {
-                ControlFlow::Continue(new_state) => {
-                    info!("observed timeline state change, new state is {new_state:?}");
+                ControlFlow::Continue(()) => {
                    return ControlFlow::Continue(());
                }
                ControlFlow::Break(()) => {
-                    info!("Timeline dropped state updates sender, stopping wal connection manager loop");
+                    debug!("Timeline is no longer active, stopping wal connection manager loop");
                    return ControlFlow::Break(());
                }
            },
@@ -390,7 +392,6 @@ impl ConnectionManagerState {

        self.drop_old_connection(true).await;

-        let id = self.id;
        let node_id = new_sk.safekeeper_id;
        let connect_timeout = self.conf.wal_connect_timeout;
        let timeline = Arc::clone(&self.timeline);
@@ -398,9 +399,13 @@ impl ConnectionManagerState {
            TaskKind::WalReceiverConnectionHandler,
            DownloadBehavior::Download,
        );
+
+        let span = info_span!("connection", %node_id);
        let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
            async move {
-                super::walreceiver_connection::handle_walreceiver_connection(
+                debug_assert_current_span_has_tenant_and_timeline_id();
+
+                let res = super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
@@ -409,12 +414,23 @@ impl ConnectionManagerState {
                    ctx,
                    node_id,
                )
-                .await
-                .context("walreceiver connection handling failure")
+                .await;
+
+                match res {
+                    Ok(()) => Ok(()),
+                    Err(e) => {
+                        use super::walreceiver_connection::ExpectedError;
+                        if e.is_expected() {
+                            info!("walreceiver connection handling ended: {e:#}");
+                            Ok(())
+                        } else {
+                            // give out an error to have task_mgr give it a really verbose logging
+                            Err(e).context("walreceiver connection handling failure")
+                        }
+                    }
+                }
            }
-            .instrument(
-                info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
-            )
+            .instrument(span)
        });

        let now = Utc::now().naive_utc();
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, trace, warn};
+use tracing::{debug, error, info, trace, warn, Instrument};

 use super::TaskStateUpdate;
-use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
 use crate::{
+    context::RequestContext,
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
    task_mgr,
    task_mgr::TaskKind,
    task_mgr::WALRECEIVER_RUNTIME,
-    tenant::{Timeline, WalReceiverInfo},
+    tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
 };
@@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection(
        config.application_name("pageserver");
        config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
        match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
-            Ok(Ok(client_and_conn)) => client_and_conn,
-            Ok(Err(conn_err)) => {
-                let expected_error = ignore_expected_errors(conn_err)?;
-                info!("DB connection stream finished: {expected_error}");
-                return Ok(());
-            }
-            Err(_) => {
+            Ok(client_and_conn) => client_and_conn?,
+            Err(_elapsed) => {
                // Timing out to connect to a safekeeper node could happen long time, due to
                // many reasons that pageserver cannot control.
                // Do not produce an error, but make it visible, that timeouts happen by logging the `event.
@@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection(
        }
    };

-    info!("connected!");
+    debug!("connected!");
    let mut connection_status = WalConnectionStatus {
        is_connected: true,
        has_processed_wal: false,
@@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection(
        "walreceiver connection",
        false,
        async move {
+            debug_assert_current_span_has_tenant_and_timeline_id();
+
            select! {
                connection_result = connection => match connection_result {
-                    Ok(()) => info!("Walreceiver db connection closed"),
+                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if let Err(e) = ignore_expected_errors(connection_error) {
-                            warn!("Connection aborted: {e:#}")
+                        if connection_error.is_expected() {
+                            // silence, because most likely we've already exited the outer call
+                            // with a similar error.
+                        } else {
+                            warn!("Connection aborted: {connection_error:#}")
                        }
                    }
                },
-                // Future: replace connection_cancellation with connection_ctx cancellation
-                _ = connection_cancellation.cancelled() => info!("Connection cancelled"),
+                _ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
            }
            Ok(())
-        },
+        }
+        .instrument(tracing::info_span!("poller")),
    );

    // Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection(
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
-                info!("walreceiver interrupted");
+                debug!("walreceiver interrupted");
                None
            }
            replication_message = physical_stream.next() => replication_message,
        }
    } {
-        let replication_message = match replication_message {
-            Ok(message) => message,
-            Err(replication_error) => {
-                let expected_error = ignore_expected_errors(replication_error)?;
-                info!("Replication stream finished: {expected_error}");
-                return Ok(());
-            }
-        };
+        let replication_message = replication_message?;

        let now = Utc::now().naive_utc();
        let last_rec_lsn_before_msg = last_rec_lsn;
@@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection(
                    let mut decoded = DecodedWALRecord::default();
                    let mut modification = timeline.begin_modification(endlsn);
                    while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-                        // let _enter = info_span!("processing record", lsn = %lsn).entered();
-
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
@@ -424,31 +415,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
    }
 }

-/// We don't want to report connectivity problems as real errors towards connection manager because
-/// 1. they happen frequently enough to make server logs hard to read and
-/// 2. the connection manager can retry other safekeeper.
-///
-/// If this function returns `Ok(pg_error)`, it's such an error.
-/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-/// Connection manager will then handle reconnections.
-///
-/// If this function returns an `Err()`, the caller can bubble it up using `?`.
-/// The connection manager will log the error at ERROR level.
-fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
-    if pg_error.is_closed()
-        || pg_error
-            .source()
-            .and_then(|source| source.downcast_ref::<std::io::Error>())
-            .map(is_expected_io_error)
-            .unwrap_or(false)
-    {
-        return Ok(pg_error);
-    } else if let Some(db_error) = pg_error.as_db_error() {
-        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-            && db_error.message().contains("ending streaming")
-        {
-            return Ok(pg_error);
-        }
-    }
-    Err(pg_error).context("connection error")
+/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
+pub(super) trait ExpectedError {
+    /// Test if this error is an ok error.
+    ///
+    /// We don't want to report connectivity problems as real errors towards connection manager because
+    /// 1. they happen frequently enough to make server logs hard to read and
+    /// 2. the connection manager can retry other safekeeper.
+    ///
+    /// If this function returns `true`, it's such an error.
+    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+    /// Connection manager will then handle reconnections.
+    ///
+    /// If this function returns an `false` the error should be propagated and the connection manager
+    /// will log the error at ERROR level.
+    fn is_expected(&self) -> bool;
+}
+
+impl ExpectedError for postgres::Error {
+    fn is_expected(&self) -> bool {
+        self.is_closed()
+            || self
+                .source()
+                .and_then(|source| source.downcast_ref::<std::io::Error>())
+                .map(is_expected_io_error)
+                .unwrap_or(false)
+            || self
+                .as_db_error()
+                .filter(|db_error| {
+                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+                        && db_error.message().contains("ending streaming")
+                })
+                .is_some()
+    }
+}
+
+impl ExpectedError for anyhow::Error {
+    fn is_expected(&self) -> bool {
+        let head = self.downcast_ref::<postgres::Error>();
+
+        let tail = self
+            .chain()
+            .filter_map(|e| e.downcast_ref::<postgres::Error>());
+
+        // check if self or any of the chained/sourced errors are expected
+        head.into_iter().chain(tail).any(|e| e.is_expected())
+    }
 }
--- a/pgxn/hnsw/Makefile
+++ b/pgxn/hnsw/Makefile
@@ -0,0 +1,26 @@
+EXTENSION = hnsw
+EXTVERSION = 0.1.0
+
+MODULE_big = hnsw
+DATA = $(wildcard *--*.sql)
+OBJS = hnsw.o hnswalg.o
+
+TESTS = $(wildcard test/sql/*.sql)
+REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
+REGRESS_OPTS = --inputdir=test --load-extension=hnsw
+
+# For auto-vectorization:
+# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
+PG_CFLAGS += -O3
+PG_CXXFLAGS +=  -O3 -std=c++11
+PG_LDFLAGS += -lstdc++
+
+all: $(EXTENSION)--$(EXTVERSION).sql
+
+PG_CONFIG ?= pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+
+dist:
+	mkdir -p dist
+	git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master
--- a/pgxn/hnsw/README.md
+++ b/pgxn/hnsw/README.md
@@ -0,0 +1,25 @@
+# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
+
+This ANN extension of Postgres is based
+on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
+the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
+
+[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
+<br>
+Dmitry Baranchuk, Artem Babenko, Yury Malkov
+
+# Postgres extension
+
+HNSW index is hold in memory (built on demand) and it's maxial size is limited
+by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
+Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
+described in the article).
+
+# Example of usage:
+
+```
+create extension hnsw;
+create table embeddings(id integer primary key, payload real[]);
+create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
+select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
+```
--- a/pgxn/hnsw/hnsw--0.1.0.sql
+++ b/pgxn/hnsw/hnsw--0.1.0.sql
@@ -0,0 +1,29 @@
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
+
+-- functions
+
+CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
+	AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
+
+-- operators
+
+CREATE OPERATOR <-> (
+	LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
+	COMMUTATOR = '<->'
+);
+
+-- access method
+
+CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
+	AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
+
+COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
+
+-- opclasses
+
+CREATE OPERATOR CLASS knn_ops
+	DEFAULT FOR TYPE real[] USING hnsw AS
+	OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;
--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -0,0 +1,551 @@
+#include "postgres.h"
+
+#include "access/amapi.h"
+#include "access/generic_xlog.h"
+#include "access/relation.h"
+#include "access/reloptions.h"
+#include "access/tableam.h"
+#include "catalog/index.h"
+#include "commands/vacuum.h"
+#include "nodes/execnodes.h"
+#include "storage/bufmgr.h"
+#include "utils/guc.h"
+#include "utils/selfuncs.h"
+
+#include <math.h>
+#include <float.h>
+
+#include "hnsw.h"
+
+PG_MODULE_MAGIC;
+
+typedef struct {
+	int32 vl_len_;		/* varlena header (do not touch directly!) */
+	int dims;
+	int maxelements;
+	int efConstruction;
+	int efSearch;
+	int M;
+} HnswOptions;
+
+static relopt_kind hnsw_relopt_kind;
+
+typedef struct {
+	HierarchicalNSW* hnsw;
+	size_t curr;
+	size_t n_results;
+	ItemPointer results;
+} HnswScanOpaqueData;
+
+typedef HnswScanOpaqueData* HnswScanOpaque;
+
+typedef struct {
+	Oid relid;
+	uint32 status;
+	HierarchicalNSW* hnsw;
+} HnswHashEntry;
+
+
+#define SH_PREFIX			 hnsw_index
+#define SH_ELEMENT_TYPE		 HnswHashEntry
+#define SH_KEY_TYPE			 Oid
+#define SH_KEY				 relid
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a)	 ((a)->relid)
+#define SH_HASH_KEY(tb, key) (key)
+#define SH_EQUAL(tb, a, b)	((a) == (b))
+#define SH_SCOPE			static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+#define INDEX_HASH_SIZE     11
+
+#define DEFAULT_EF_SEARCH   64
+
+PGDLLEXPORT void _PG_init(void);
+
+static hnsw_index_hash *hnsw_indexes;
+
+/*
+ * Initialize index options and variables
+ */
+void
+_PG_init(void)
+{
+	hnsw_relopt_kind = add_reloption_kind();
+	add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
+					  0, 0, INT_MAX, AccessExclusiveLock);
+	add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
+					  0, 0, INT_MAX, AccessExclusiveLock);
+	add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
+					  100, 0, INT_MAX, AccessExclusiveLock);
+	add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
+					  16, 1, INT_MAX, AccessExclusiveLock);
+	add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
+					  64, 1, INT_MAX, AccessExclusiveLock);
+	hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
+}
+
+
+static void
+hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
+					bool *isnull, bool tupleIsAlive, void *state)
+{
+	HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
+	ArrayType* array;
+	int n_items;
+	label_t label = 0;
+
+	/* Skip nulls */
+	if (isnull[0])
+		return;
+
+	array = DatumGetArrayTypeP(values[0]);
+	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+	if (n_items != hnsw_dimensions(hnsw))
+	{
+		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
+			 n_items, hnsw_dimensions(hnsw));
+	}
+
+	memcpy(&label, tid, sizeof(*tid));
+	hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
+}
+
+static void
+hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
+{
+	IndexInfo* indexInfo = BuildIndexInfo(indexRel);
+	Assert(indexInfo->ii_NumIndexAttrs == 1);
+	table_index_build_scan(heapRel, indexRel, indexInfo,
+						   true, true, hnsw_build_callback, (void *) hnsw, NULL);
+}
+
+static HierarchicalNSW*
+hnsw_get_index(Relation indexRel, Relation heapRel)
+{
+	HierarchicalNSW* hnsw;
+	Oid indexoid = RelationGetRelid(indexRel);
+	HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
+	if (entry == NULL)
+	{
+		size_t dims, maxelements;
+		size_t M;
+		size_t maxM;
+		size_t size_links_level0;
+		size_t size_data_per_element;
+		size_t data_size;
+		dsm_handle handle = indexoid << 1; /* make it even */
+		void* impl_private = NULL;
+		void* mapped_address = NULL;
+		Size  mapped_size = 0;
+		Size  shmem_size;
+		bool exists = true;
+		bool found;
+		HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
+		if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
+			elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
+		}
+		dims = opts->dims;
+		maxelements = opts->maxelements;
+		M = opts->M;
+		maxM = M * 2;
+		data_size = dims * sizeof(coord_t);
+		size_links_level0 = (maxM + 1) * sizeof(idx_t);
+		size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
+		shmem_size =  hnsw_sizeof() + maxelements * size_data_per_element;
+
+		/* first try to attach to existed index */
+		if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
+						 &mapped_address, &mapped_size, DEBUG1))
+		{
+			/* index doesn't exists: try to create it */
+			if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
+							 &mapped_address, &mapped_size, DEBUG1))
+			{
+				/* We can do it under shared lock, so some other backend may
+				 * try to initialize index. If create is failed because index already
+				 * created by somebody else, then try to attach to it once again
+				 */
+				if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
+								 &mapped_address, &mapped_size, ERROR))
+				{
+					return NULL;
+				}
+			}
+			else
+			{
+				exists = false;
+			}
+		}
+		Assert(mapped_size == shmem_size);
+		hnsw = (HierarchicalNSW*)mapped_address;
+
+		if (!exists)
+		{
+			hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
+			hnsw_populate(hnsw, indexRel, heapRel);
+		}
+		entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
+		Assert(!found);
+		entry->hnsw = hnsw;
+	}
+	else
+	{
+		hnsw = entry->hnsw;
+	}
+	return hnsw;
+}
+
+/*
+ * Start or restart an index scan
+ */
+static IndexScanDesc
+hnsw_beginscan(Relation index, int nkeys, int norderbys)
+{
+	IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
+	HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
+	Relation heap = relation_open(index->rd_index->indrelid, NoLock);
+	so->hnsw = hnsw_get_index(index, heap);
+	relation_close(heap, NoLock);
+	so->curr = 0;
+	so->n_results = 0;
+	so->results = NULL;
+	scan->opaque = so;
+	return scan;
+}
+
+/*
+ * Start or restart an index scan
+ */
+static void
+hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
+{
+	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
+	if (so->results)
+	{
+		pfree(so->results);
+		so->results = NULL;
+	}
+	so->curr = 0;
+	if (orderbys && scan->numberOfOrderBys > 0)
+		memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
+}
+
+/*
+ * Fetch the next tuple in the given scan
+ */
+static bool
+hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
+{
+	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
+
+	/*
+	 * Index can be used to scan backward, but Postgres doesn't support
+	 * backward scan on operators
+	 */
+	Assert(ScanDirectionIsForward(dir));
+
+	if (so->curr == 0)
+	{
+		Datum		value;
+		ArrayType*	array;
+		int         n_items;
+		size_t      n_results;
+		label_t*    results;
+		HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
+		size_t      efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
+
+		/* Safety check */
+		if (scan->orderByData == NULL)
+			elog(ERROR, "cannot scan HNSW index without order");
+
+		/* No items will match if null */
+		if (scan->orderByData->sk_flags & SK_ISNULL)
+			return false;
+
+		value = scan->orderByData->sk_argument;
+		array = DatumGetArrayTypeP(value);
+		n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+		if (n_items != hnsw_dimensions(so->hnsw))
+		{
+			elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
+				 n_items, hnsw_dimensions(so->hnsw));
+		}
+
+		if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
+			elog(ERROR, "HNSW index search failed");
+		so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
+		so->n_results = n_results;
+		for (size_t i = 0; i < n_results; i++)
+		{
+			memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
+		}
+		free(results);
+	}
+	if (so->curr >= so->n_results)
+	{
+		return false;
+	}
+	else
+	{
+		scan->xs_heaptid = so->results[so->curr++];
+		scan->xs_recheckorderby = false;
+		return true;
+	}
+}
+
+/*
+ * End a scan and release resources
+ */
+static void
+hnsw_endscan(IndexScanDesc scan)
+{
+	HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
+	if (so->results)
+		pfree(so->results);
+	pfree(so);
+	scan->opaque = NULL;
+}
+
+
+/*
+ * Estimate the cost of an index scan
+ */
+static void
+hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
+				 Cost *indexStartupCost, Cost *indexTotalCost,
+				 Selectivity *indexSelectivity, double *indexCorrelation
+				 ,double *indexPages
+)
+{
+	GenericCosts costs;
+
+	/* Never use index without order */
+	if (path->indexorderbys == NULL)
+	{
+		*indexStartupCost = DBL_MAX;
+		*indexTotalCost = DBL_MAX;
+		*indexSelectivity = 0;
+		*indexCorrelation = 0;
+		*indexPages = 0;
+		return;
+	}
+
+	MemSet(&costs, 0, sizeof(costs));
+
+	genericcostestimate(root, path, loop_count, &costs);
+
+	/* Startup cost and total cost are same */
+	*indexStartupCost = costs.indexTotalCost;
+	*indexTotalCost = costs.indexTotalCost;
+	*indexSelectivity = costs.indexSelectivity;
+	*indexCorrelation = costs.indexCorrelation;
+	*indexPages = costs.numIndexPages;
+}
+
+/*
+ * Parse and validate the reloptions
+ */
+static bytea *
+hnsw_options(Datum reloptions, bool validate)
+{
+	static const relopt_parse_elt tab[] = {
+		{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
+		{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
+		{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
+		{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
+		{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
+	};
+
+	return (bytea *) build_reloptions(reloptions, validate,
+									  hnsw_relopt_kind,
+									  sizeof(HnswOptions),
+									  tab, lengthof(tab));
+}
+
+/*
+ * Validate catalog entries for the specified operator class
+ */
+static bool
+hnsw_validate(Oid opclassoid)
+{
+	return true;
+}
+
+/*
+ * Build the index for a logged table
+ */
+static IndexBuildResult *
+hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
+	IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+	result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
+
+	return result;
+}
+
+/*
+ * Insert a tuple into the index
+ */
+static bool
+hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
+			  Relation heap, IndexUniqueCheck checkUnique,
+			  bool indexUnchanged,
+			  IndexInfo *indexInfo)
+{
+	HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
+	Datum value;
+	ArrayType* array;
+	int n_items;
+	label_t label = 0;
+
+	/* Skip nulls */
+	if (isnull[0])
+		return false;
+
+	/* Detoast value */
+	value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
+	array = DatumGetArrayTypeP(value);
+	n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
+	if (n_items != hnsw_dimensions(hnsw))
+	{
+		elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
+			 n_items, hnsw_dimensions(hnsw));
+	}
+	memcpy(&label, heap_tid, sizeof(*heap_tid));
+	if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
+		elog(ERROR, "HNSW index insert failed");
+	return true;
+}
+
+/*
+ * Build the index for an unlogged table
+ */
+static void
+hnsw_buildempty(Relation index)
+{
+	/* index will be constructed on dema nd when accessed */
+}
+
+/*
+ * Clean up after a VACUUM operation
+ */
+static IndexBulkDeleteResult *
+hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+	Relation	rel = info->index;
+
+	if (stats == NULL)
+		return NULL;
+
+	stats->num_pages = RelationGetNumberOfBlocks(rel);
+
+	return stats;
+}
+
+/*
+ * Bulk delete tuples from the index
+ */
+static IndexBulkDeleteResult *
+hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+				IndexBulkDeleteCallback callback, void *callback_state)
+{
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	return stats;
+}
+
+/*
+ * Define index handler
+ *
+ * See https://www.postgresql.org/docs/current/index-api.html
+ */
+PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
+Datum
+hnsw_handler(PG_FUNCTION_ARGS)
+{
+	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+	amroutine->amstrategies = 0;
+	amroutine->amsupport = 0;
+	amroutine->amoptsprocnum = 0;
+	amroutine->amcanorder = false;
+	amroutine->amcanorderbyop = true;
+	amroutine->amcanbackward = false;	/* can change direction mid-scan */
+	amroutine->amcanunique = false;
+	amroutine->amcanmulticol = false;
+	amroutine->amoptionalkey = true;
+	amroutine->amsearcharray = false;
+	amroutine->amsearchnulls = false;
+	amroutine->amstorage = false;
+	amroutine->amclusterable = false;
+	amroutine->ampredlocks = false;
+	amroutine->amcanparallel = false;
+	amroutine->amcaninclude = false;
+	amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
+	amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
+	amroutine->amkeytype = InvalidOid;
+
+	/* Interface functions */
+	amroutine->ambuild = hnsw_build;
+	amroutine->ambuildempty = hnsw_buildempty;
+	amroutine->aminsert = hnsw_insert;
+	amroutine->ambulkdelete = hnsw_bulkdelete;
+	amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
+	amroutine->amcanreturn = NULL;	/* tuple not included in heapsort */
+	amroutine->amcostestimate = hnsw_costestimate;
+	amroutine->amoptions = hnsw_options;
+	amroutine->amproperty = NULL;	/* TODO AMPROP_DISTANCE_ORDERABLE */
+	amroutine->ambuildphasename = NULL;
+	amroutine->amvalidate = hnsw_validate;
+	amroutine->amadjustmembers = NULL;
+	amroutine->ambeginscan = hnsw_beginscan;
+	amroutine->amrescan = hnsw_rescan;
+	amroutine->amgettuple = hnsw_gettuple;
+	amroutine->amgetbitmap = NULL;
+	amroutine->amendscan = hnsw_endscan;
+	amroutine->ammarkpos = NULL;
+	amroutine->amrestrpos = NULL;
+
+	/* Interface functions to support parallel index scans */
+	amroutine->amestimateparallelscan = NULL;
+	amroutine->aminitparallelscan = NULL;
+	amroutine->amparallelrescan = NULL;
+
+	PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * Get the L2 distance between vectors
+ */
+PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
+Datum
+l2_distance(PG_FUNCTION_ARGS)
+{
+	ArrayType  *a = PG_GETARG_ARRAYTYPE_P(0);
+	ArrayType  *b = PG_GETARG_ARRAYTYPE_P(1);
+	int         a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
+	int         b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
+	dist_t 		distance = 0.0;
+	dist_t		diff;
+	coord_t	   *ax = (coord_t*)ARR_DATA_PTR(a);
+	coord_t	   *bx = (coord_t*)ARR_DATA_PTR(b);
+
+	if (a_dim != b_dim)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("different array dimensions %d and %d", a_dim, b_dim)));
+	}
+
+	for (int i = 0; i < a_dim; i++)
+	{
+		diff = ax[i] - bx[i];
+		distance += diff * diff;
+	}
+
+	PG_RETURN_FLOAT4((dist_t)sqrt(distance));
+}
--- a/pgxn/hnsw/hnsw.control
+++ b/pgxn/hnsw/hnsw.control
@@ -0,0 +1,5 @@
+comment = 'hNsw index'
+default_version = '0.1.0'
+module_pathname = '$libdir/hnsw'
+relocatable = true
+trusted = true
--- a/pgxn/hnsw/hnsw.h
+++ b/pgxn/hnsw/hnsw.h
@@ -0,0 +1,15 @@
+#pragma once
+
+typedef float    coord_t;
+typedef float    dist_t;
+typedef uint32_t idx_t;
+typedef uint64_t label_t;
+
+typedef struct HierarchicalNSW HierarchicalNSW;
+
+bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
+bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
+void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
+int  hnsw_dimensions(HierarchicalNSW* hnsw);
+size_t hnsw_count(HierarchicalNSW* hnsw);
+size_t hnsw_sizeof(void);
--- a/pgxn/hnsw/hnswalg.cpp
+++ b/pgxn/hnsw/hnswalg.cpp
@@ -0,0 +1,379 @@
+#include "hnswalg.h"
+
+#if defined(__GNUC__)
+#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
+#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
+#else
+#define PORTABLE_ALIGN32 __declspec(align(32))
+#define PREFETCH(addr,hint)
+#endif
+
+HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
+{
+    dim = dim_;
+    data_size = dim * sizeof(coord_t);
+
+    efConstruction = efConstruction_;
+
+    maxelements = maxelements_;
+    M = M_;
+    maxM = maxM_;
+    size_links_level0 = (maxM + 1) * sizeof(idx_t);
+    size_data_per_element = size_links_level0 + data_size  + sizeof(label_t);
+    offset_data = size_links_level0;
+	offset_label = offset_data + data_size;
+
+    enterpoint_node = 0;
+    cur_element_count = 0;
+#ifdef __x86_64__
+    use_avx2 = __builtin_cpu_supports("avx2");
+#endif
+}
+
+std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
+{
+	std::vector<uint32_t> visited;
+	visited.resize((cur_element_count + 31) >> 5);
+
+    std::priority_queue<std::pair<dist_t, idx_t >> topResults;
+    std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
+
+    dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
+
+    topResults.emplace(dist, enterpoint_node);
+    candidateSet.emplace(-dist, enterpoint_node);
+    visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
+    dist_t lowerBound = dist;
+
+    while (!candidateSet.empty())
+    {
+        std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
+        if (-curr_el_pair.first > lowerBound)
+            break;
+
+        candidateSet.pop();
+        idx_t curNodeNum = curr_el_pair.second;
+
+        idx_t* data = get_linklist0(curNodeNum);
+        size_t size = *data++;
+
+        PREFETCH(getDataByInternalId(*data), 0);
+
+        for (size_t j = 0; j < size; ++j) {
+            size_t tnum = *(data + j);
+
+            PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
+
+            if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
+				visited[tnum >> 5] |= 1 << (tnum & 31);
+
+                dist = fstdistfunc(point, getDataByInternalId(tnum));
+
+                if (topResults.top().first > dist || topResults.size() < ef) {
+                    candidateSet.emplace(-dist, tnum);
+
+                    PREFETCH(get_linklist0(candidateSet.top().second), 0);
+                    topResults.emplace(dist, tnum);
+
+                    if (topResults.size() > ef)
+                        topResults.pop();
+
+                    lowerBound = topResults.top().first;
+                }
+            }
+        }
+    }
+    return topResults;
+}
+
+
+void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
+{
+    if (topResults.size() < NN)
+        return;
+
+    std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
+    std::vector<std::pair<dist_t, idx_t>> returnlist;
+
+    while (topResults.size() > 0) {
+        resultSet.emplace(-topResults.top().first, topResults.top().second);
+        topResults.pop();
+    }
+
+    while (resultSet.size()) {
+        if (returnlist.size() >= NN)
+            break;
+        std::pair<dist_t, idx_t> curen = resultSet.top();
+        dist_t dist_to_query = -curen.first;
+        resultSet.pop();
+        bool good = true;
+        for (std::pair<dist_t, idx_t> curen2 : returnlist) {
+            dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
+                                         getDataByInternalId(curen.second));
+            if (curdist < dist_to_query) {
+                good = false;
+                break;
+            }
+        }
+        if (good) returnlist.push_back(curen);
+    }
+    for (std::pair<dist_t, idx_t> elem : returnlist)
+        topResults.emplace(-elem.first, elem.second);
+}
+
+void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
+                               std::priority_queue<std::pair<dist_t, idx_t>> topResults)
+{
+    getNeighborsByHeuristic(topResults, M);
+
+    std::vector<idx_t> res;
+    res.reserve(M);
+    while (topResults.size() > 0) {
+        res.push_back(topResults.top().second);
+        topResults.pop();
+    }
+    {
+        idx_t* data = get_linklist0(cur_c);
+        if (*data)
+            throw std::runtime_error("Should be blank");
+
+        *data++ = res.size();
+
+        for (size_t idx = 0; idx < res.size(); idx++) {
+            if (data[idx])
+                throw std::runtime_error("Should be blank");
+            data[idx] = res[idx];
+        }
+    }
+    for (size_t idx = 0; idx < res.size(); idx++) {
+        if (res[idx] == cur_c)
+            throw std::runtime_error("Connection to the same element");
+
+        size_t resMmax = maxM;
+        idx_t *ll_other = get_linklist0(res[idx]);
+        idx_t sz_link_list_other = *ll_other;
+
+        if (sz_link_list_other > resMmax || sz_link_list_other < 0)
+            throw std::runtime_error("Bad sz_link_list_other");
+
+        if (sz_link_list_other < resMmax) {
+            idx_t *data = ll_other + 1;
+            data[sz_link_list_other] = cur_c;
+            *ll_other = sz_link_list_other + 1;
+        } else {
+            // finding the "weakest" element to replace it with the new one
+            idx_t *data = ll_other + 1;
+            dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
+            // Heuristic:
+            std::priority_queue<std::pair<dist_t, idx_t>> candidates;
+            candidates.emplace(d_max, cur_c);
+
+            for (size_t j = 0; j < sz_link_list_other; j++)
+                candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
+
+            getNeighborsByHeuristic(candidates, resMmax);
+
+            size_t indx = 0;
+            while (!candidates.empty()) {
+                data[indx] = candidates.top().second;
+                candidates.pop();
+                indx++;
+            }
+            *ll_other = indx;
+        }
+    }
+}
+
+void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
+{
+    if (cur_element_count >= maxelements) {
+        throw std::runtime_error("The number of elements exceeds the specified limit");
+    }
+    idx_t cur_c = cur_element_count++;
+    memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
+    memcpy(getDataByInternalId(cur_c), point, data_size);
+    memcpy(getExternalLabel(cur_c), &label, sizeof label);
+
+    // Do nothing for the first element
+    if (cur_c != 0) {
+        std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
+        mutuallyConnectNewElement(point, cur_c, topResults);
+    }
+};
+
+std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
+{
+	std::priority_queue<std::pair<dist_t, label_t>> topResults;
+	auto topCandidates = searchBaseLayer(query, k);
+    while (topCandidates.size() > k) {
+        topCandidates.pop();
+	}
+	while (!topCandidates.empty()) {
+		std::pair<dist_t, idx_t> rez = topCandidates.top();
+		label_t label;
+		memcpy(&label, getExternalLabel(rez.second), sizeof(label));
+		topResults.push(std::pair<dist_t, label_t>(rez.first, label));
+		topCandidates.pop();
+	}
+
+    return topResults;
+};
+
+dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
+{
+    dist_t 	distance = 0.0;
+
+    for (size_t i = 0; i < n; i++)
+    {
+        dist_t diff = x[i] - y[i];
+        distance += diff * diff;
+    }
+    return distance;
+
+}
+
+#ifdef __x86_64__
+#include <immintrin.h>
+
+__attribute__((target("avx2")))
+dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
+{
+    const size_t TmpResSz = sizeof(__m256) / sizeof(float);
+    float PORTABLE_ALIGN32 TmpRes[TmpResSz];
+    size_t qty16 = n / 16;
+    const float *pEnd1 = x + (qty16 * 16);
+    __m256 diff, v1, v2;
+    __m256 sum = _mm256_set1_ps(0);
+
+    while (x < pEnd1) {
+        v1 = _mm256_loadu_ps(x);
+        x += 8;
+        v2 = _mm256_loadu_ps(y);
+        y += 8;
+        diff = _mm256_sub_ps(v1, v2);
+        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
+
+        v1 = _mm256_loadu_ps(x);
+        x += 8;
+        v2 = _mm256_loadu_ps(y);
+        y += 8;
+        diff = _mm256_sub_ps(v1, v2);
+        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
+    }
+    _mm256_store_ps(TmpRes, sum);
+    float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+    return (res);
+}
+
+dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
+{
+    const size_t TmpResSz = sizeof(__m128) / sizeof(float);
+    float PORTABLE_ALIGN32 TmpRes[TmpResSz];
+    size_t qty16 = n / 16;
+    const float *pEnd1 = x + (qty16 * 16);
+
+    __m128 diff, v1, v2;
+    __m128 sum = _mm_set1_ps(0);
+
+    while (x < pEnd1) {
+        v1 = _mm_loadu_ps(x);
+        x += 4;
+        v2 = _mm_loadu_ps(y);
+        y += 4;
+        diff = _mm_sub_ps(v1, v2);
+        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+
+        v1 = _mm_loadu_ps(x);
+        x += 4;
+        v2 = _mm_loadu_ps(y);
+        y += 4;
+        diff = _mm_sub_ps(v1, v2);
+        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+
+        v1 = _mm_loadu_ps(x);
+        x += 4;
+        v2 = _mm_loadu_ps(y);
+        y += 4;
+        diff = _mm_sub_ps(v1, v2);
+        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+
+        v1 = _mm_loadu_ps(x);
+        x += 4;
+        v2 = _mm_loadu_ps(y);
+        y += 4;
+        diff = _mm_sub_ps(v1, v2);
+        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+    }
+    _mm_store_ps(TmpRes, sum);
+    float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+    return res;
+}
+#endif
+
+dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
+{
+#ifndef __x86_64__
+    return fstdistfunc_scalar(x, y, dim);
+#else
+    if(use_avx2)
+        return fstdistfunc_avx2(x, y, dim);
+
+    return fstdistfunc_sse(x, y, dim);
+#endif
+}
+
+bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
+{
+	try
+	{
+		auto result = hnsw->searchKnn(point, efSearch);
+		size_t nResults = result.size();
+		*results = (label_t*)malloc(nResults*sizeof(label_t));
+		for (size_t i = nResults; i-- != 0;)
+		{
+			(*results)[i] = result.top().second;
+			result.pop();
+		}
+		*n_results = nResults;
+		return true;
+	}
+	catch (std::exception& x)
+	{
+		return false;
+	}
+}
+
+bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
+{
+	try
+	{
+		hnsw->addPoint(point, label);
+		return true;
+	}
+	catch (std::exception& x)
+	{
+		fprintf(stderr, "Catch %s\n", x.what());
+		return false;
+	}
+}
+
+void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
+{
+	new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
+}
+
+
+int hnsw_dimensions(HierarchicalNSW* hnsw)
+{
+	return (int)hnsw->dim;
+}
+
+size_t hnsw_count(HierarchicalNSW* hnsw)
+{
+	return hnsw->cur_element_count;
+}
+
+size_t hnsw_sizeof(void)
+{
+	return sizeof(HierarchicalNSW);
+}
--- a/pgxn/hnsw/hnswalg.h
+++ b/pgxn/hnsw/hnswalg.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <map>
+#include <cmath>
+#include <queue>
+#include <stdexcept>
+
+extern "C" {
+#include "hnsw.h"
+}
+
+struct HierarchicalNSW
+{
+	size_t maxelements;
+	size_t cur_element_count;
+
+	idx_t  enterpoint_node;
+
+	size_t dim;
+	size_t data_size;
+	size_t offset_data;
+	size_t offset_label;
+	size_t size_data_per_element;
+	size_t M;
+	size_t maxM;
+	size_t size_links_level0;
+	size_t efConstruction;
+
+#ifdef __x86_64__
+	bool	use_avx2;
+#endif
+
+	char   data_level0_memory[0]; // varying size
+
+  public:
+	HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
+	~HierarchicalNSW();
+
+
+	inline coord_t *getDataByInternalId(idx_t internal_id) const {
+		return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
+	}
+
+	inline idx_t *get_linklist0(idx_t internal_id) const {
+		return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
+	}
+
+	inline label_t *getExternalLabel(idx_t internal_id) const {
+		return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
+	}
+
+	std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
+
+	void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
+
+	void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
+
+	void addPoint(const coord_t *point, label_t label);
+
+	std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
+
+	dist_t fstdistfunc(const coord_t *x, const coord_t *y);
+};
--- a/pgxn/hnsw/test/expected/knn.out
+++ b/pgxn/hnsw/test/expected/knn.out
@@ -0,0 +1,28 @@
+SET enable_seqscan = off;
+CREATE TABLE t (val real[]);
+INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
+CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
+INSERT INTO t (val) VALUES (array[1,2,4]);
+explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
+                             QUERY PLAN                             
+--------------------------------------------------------------------
+ Index Scan using t_val_idx on t  (cost=4.02..8.06 rows=3 width=36)
+   Order By: (val <-> '{3,3,3}'::real[])
+(2 rows)
+
+SELECT * FROM t ORDER BY val <-> array[3,3,3];
+   val   
+---------
+ {1,2,3}
+ {1,2,4}
+ {1,1,1}
+ {0,0,0}
+(4 rows)
+
+SELECT COUNT(*) FROM t;
+ count 
+-------
+     5
+(1 row)
+
+DROP TABLE t;
--- a/pgxn/hnsw/test/sql/knn.sql
+++ b/pgxn/hnsw/test/sql/knn.sql
@@ -0,0 +1,13 @@
+SET enable_seqscan = off;
+
+CREATE TABLE t (val real[]);
+INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
+CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
+
+INSERT INTO t (val) VALUES (array[1,2,4]);
+
+explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
+SELECT * FROM t ORDER BY val <-> array[3,3,3];
+SELECT COUNT(*) FROM t;
+
+DROP TABLE t;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -11,10 +11,12 @@ OBJS = \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o
+	walproposer_utils.o \
+	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
+SHLIB_LINK = -lcurl

 EXTENSION = neon
 DATA = neon--1.0.sql
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -0,0 +1,830 @@
+/*-------------------------------------------------------------------------
+ *
+ * control_plane_connector.c
+ *	  Captures updates to roles/databases using ProcessUtility_hook and
+ *        sends them to the control ProcessUtility_hook. The changes are sent
+ *        via HTTP to the URL specified by the GUC neon.console_url when the
+ *        transaction commits. Forwarding may be disabled temporarily by
+ *        setting neon.forward_ddl to false.
+ *
+ *        Currently, the transaction may abort AFTER
+ *        changes have already been forwarded, and that case is not handled.
+ *        Subtransactions are handled using a stack of hash tables, which
+ *        accumulate changes. On subtransaction commit, the top of the stack
+ *        is merged with the table below it.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/control_plane_connector.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include <curl/curl.h>
+#include "utils/jsonb.h"
+
+static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
+
+/* GUCs */
+static char *ConsoleURL = NULL;
+static bool ForwardDDL = true;
+
+/* Curl structures for sending the HTTP requests */
+static CURL * CurlHandle;
+static struct curl_slist *ContentHeader = NULL;
+
+/*
+ * CURL docs say that this buffer must exist until we call curl_easy_cleanup
+ * (which we never do), so we make this a static
+ */
+static char CurlErrorBuf[CURL_ERROR_SIZE];
+
+typedef enum
+{
+	Op_Set,						/* An upsert: Either a creation or an alter */
+	Op_Delete,
+}			OpType;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	Oid			owner;
+	char		old_name[NAMEDATALEN];
+	OpType		type;
+}			DbEntry;
+
+typedef struct
+{
+	char		name[NAMEDATALEN];
+	char		old_name[NAMEDATALEN];
+	const char *password;
+	OpType		type;
+}			RoleEntry;
+
+/*
+ * We keep one of these for each subtransaction in a stack. When a subtransaction
+ * commits, we merge the top of the stack into the table below it. It is allocated in the
+ * subtransaction's context.
+ */
+typedef struct DdlHashTable
+{
+	struct DdlHashTable *prev_table;
+	HTAB	   *db_table;
+	HTAB	   *role_table;
+}			DdlHashTable;
+
+static DdlHashTable RootTable;
+static DdlHashTable * CurrentDdlTable = &RootTable;
+
+static void
+PushKeyValue(JsonbParseState **state, char *key, char *value)
+{
+	JsonbValue	k,
+				v;
+
+	k.type = jbvString;
+	k.val.string.len = strlen(key);
+	k.val.string.val = key;
+	v.type = jbvString;
+	v.val.string.len = strlen(value);
+	v.val.string.val = value;
+	pushJsonbValue(state, WJB_KEY, &k);
+	pushJsonbValue(state, WJB_VALUE, &v);
+}
+
+static char *
+ConstructDeltaMessage()
+{
+	JsonbParseState *state = NULL;
+
+	pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+	if (RootTable.db_table)
+	{
+		JsonbValue	dbs;
+
+		dbs.type = jbvString;
+		dbs.val.string.val = "dbs";
+		dbs.val.string.len = strlen(dbs.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &dbs);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		DbEntry    *entry;
+
+		hash_seq_init(&status, RootTable.db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->owner != InvalidOid)
+			{
+				PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false));
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+
+	if (RootTable.role_table)
+	{
+		JsonbValue	roles;
+
+		roles.type = jbvString;
+		roles.val.string.val = "roles";
+		roles.val.string.len = strlen(roles.val.string.val);
+		pushJsonbValue(&state, WJB_KEY, &roles);
+		pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL);
+
+		HASH_SEQ_STATUS status;
+		RoleEntry  *entry;
+
+		hash_seq_init(&status, RootTable.role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL);
+			PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del");
+			PushKeyValue(&state, "name", entry->name);
+			if (entry->password)
+			{
+				PushKeyValue(&state, "password", (char *) entry->password);
+			}
+			if (entry->old_name[0] != '\0')
+			{
+				PushKeyValue(&state, "old_name", entry->old_name);
+			}
+			pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+		}
+		pushJsonbValue(&state, WJB_END_ARRAY, NULL);
+	}
+	JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL);
+	Jsonb	   *jsonb = JsonbValueToJsonb(result);
+
+	return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ );
+}
+
+#define ERROR_SIZE 1024
+
+typedef struct
+{
+	char		str[ERROR_SIZE];
+	size_t		size;
+}			ErrorString;
+
+static size_t
+ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
+{
+	/* Docs say size is always 1 */
+	ErrorString *str = userdata;
+
+	size_t		to_write = nmemb;
+
+	/* +1 for null terminator */
+	if (str->size + nmemb + 1 >= ERROR_SIZE)
+		to_write = ERROR_SIZE - str->size - 1;
+
+	/* Ignore everyrthing past the first ERROR_SIZE bytes */
+	if (to_write == 0)
+		return nmemb;
+	memcpy(str->str + str->size, ptr, to_write);
+	str->size += to_write;
+	str->str[str->size] = '\0';
+	return nmemb;
+}
+
+static void
+SendDeltasToControlPlane()
+{
+	if (!RootTable.db_table && !RootTable.role_table)
+		return;
+	if (!ConsoleURL)
+	{
+		elog(LOG, "ConsoleURL not set, skipping forwarding");
+		return;
+	}
+	if (!ForwardDDL)
+		return;
+
+	char	   *message = ConstructDeltaMessage();
+	ErrorString str = {};
+
+	curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH");
+	curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader);
+	curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message);
+	curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL);
+	curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf);
+	curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ );
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str);
+	curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback);
+
+	const int	num_retries = 5;
+	int			curl_status;
+
+	for (int i = 0; i < num_retries; i++)
+	{
+		if ((curl_status = curl_easy_perform(CurlHandle)) == 0)
+			break;
+		elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf);
+		pg_usleep(1000 * 1000);
+	}
+	if (curl_status != 0)
+	{
+		elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf);
+	}
+	else
+	{
+		long		response_code;
+
+		if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION)
+		{
+			bool		error_exists = str.size != 0;
+
+			if (response_code != 200)
+			{
+				if (error_exists)
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane: %s",
+						 response_code,
+						 str.str);
+				}
+				else
+				{
+					elog(ERROR,
+						 "Received HTTP code %ld from control plane",
+						 response_code);
+				}
+			}
+		}
+	}
+}
+
+static void
+InitDbTableIfNeeded()
+{
+	if (!CurrentDdlTable->db_table)
+	{
+		HASHCTL		db_ctl = {};
+
+		db_ctl.keysize = NAMEDATALEN;
+		db_ctl.entrysize = sizeof(DbEntry);
+		db_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->db_table = hash_create(
+												"Dbs Created",
+												4,
+												&db_ctl,
+												HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+InitRoleTableIfNeeded()
+{
+	if (!CurrentDdlTable->role_table)
+	{
+		HASHCTL		role_ctl = {};
+
+		role_ctl.keysize = NAMEDATALEN;
+		role_ctl.entrysize = sizeof(RoleEntry);
+		role_ctl.hcxt = CurTransactionContext;
+		CurrentDdlTable->role_table = hash_create(
+												  "Roles Created",
+												  4,
+												  &role_ctl,
+												  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+	}
+}
+
+static void
+PushTable()
+{
+	DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable));
+
+	new_table->prev_table = CurrentDdlTable;
+	new_table->role_table = NULL;
+	new_table->db_table = NULL;
+	CurrentDdlTable = new_table;
+}
+
+static void
+MergeTable()
+{
+	DdlHashTable *old_table = CurrentDdlTable;
+
+	CurrentDdlTable = old_table->prev_table;
+
+	if (old_table->db_table)
+	{
+		InitDbTableIfNeeded();
+		DbEntry    *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->db_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			DbEntry    *to_write = hash_search(
+											   CurrentDdlTable->db_table,
+											   entry->name,
+											   HASH_ENTER,
+											   NULL);
+
+			to_write->type = entry->type;
+			if (entry->owner != InvalidOid)
+				to_write->owner = entry->owner;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				DbEntry    *old = hash_search(
+											  CurrentDdlTable->db_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(
+								CurrentDdlTable->db_table,
+								entry->old_name,
+								HASH_REMOVE,
+								NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->db_table);
+	}
+
+	if (old_table->role_table)
+	{
+		InitRoleTableIfNeeded();
+		RoleEntry  *entry;
+		HASH_SEQ_STATUS status;
+
+		hash_seq_init(&status, old_table->role_table);
+		while ((entry = hash_seq_search(&status)) != NULL)
+		{
+			RoleEntry  *to_write = hash_search(
+											   CurrentDdlTable->role_table,
+											   entry->name,
+											   HASH_ENTER,
+											   NULL);
+
+			to_write->type = entry->type;
+			if (entry->password)
+				to_write->password = entry->password;
+			strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+			if (entry->old_name[0] != '\0')
+			{
+				bool		found_old = false;
+				RoleEntry  *old = hash_search(
+											  CurrentDdlTable->role_table,
+											  entry->old_name,
+											  HASH_FIND,
+											  &found_old);
+
+				if (found_old)
+				{
+					if (old->old_name[0] != '\0')
+						strlcpy(to_write->old_name, old->old_name, NAMEDATALEN);
+					else
+						strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN);
+					hash_search(CurrentDdlTable->role_table,
+								entry->old_name,
+								HASH_REMOVE,
+								NULL);
+				}
+			}
+		}
+		hash_destroy(old_table->role_table);
+	}
+}
+
+static void
+PopTable()
+{
+	/*
+	 * Current table gets freed because it is allocated in aborted
+	 * subtransaction's memory context.
+	 */
+	CurrentDdlTable = CurrentDdlTable->prev_table;
+}
+
+static void
+NeonSubXactCallback(
+					SubXactEvent event,
+					SubTransactionId mySubid,
+					SubTransactionId parentSubid,
+					void *arg)
+{
+	switch (event)
+	{
+		case SUBXACT_EVENT_START_SUB:
+			return PushTable();
+		case SUBXACT_EVENT_COMMIT_SUB:
+			return MergeTable();
+		case SUBXACT_EVENT_ABORT_SUB:
+			return PopTable();
+		default:
+			return;
+	}
+}
+
+static void
+NeonXactCallback(XactEvent event, void *arg)
+{
+	if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT)
+	{
+		SendDeltasToControlPlane();
+	}
+	RootTable.role_table = NULL;
+	RootTable.db_table = NULL;
+	Assert(CurrentDdlTable == &RootTable);
+}
+
+static void
+HandleCreateDb(CreatedbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	DefElem    *downer = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "owner") == 0)
+			downer = defel;
+	}
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->type = Op_Set;
+	if (downer && downer->arg)
+		entry->owner = get_role_oid(defGetString(downer), false);
+	else
+		entry->owner = GetUserId();
+}
+
+static void
+HandleAlterOwner(AlterOwnerStmt *stmt)
+{
+	if (stmt->objectType != OBJECT_DATABASE)
+		return;
+	InitDbTableIfNeeded();
+	const char *name = strVal(stmt->object);
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									name,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+
+	entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
+	entry->type = Op_Set;
+}
+
+static void
+HandleDbRename(RenameStmt *stmt)
+{
+	Assert(stmt->renameType == OBJECT_DATABASE);
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+	DbEntry    *entry_for_new_name = hash_search(
+												 CurrentDdlTable->db_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->owner = entry->owner;
+		hash_search(
+					CurrentDdlTable->db_table,
+					stmt->subname,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->owner = InvalidOid;
+	}
+}
+
+static void
+HandleDropDb(DropdbStmt *stmt)
+{
+	InitDbTableIfNeeded();
+	bool		found = false;
+	DbEntry    *entry = hash_search(
+									CurrentDdlTable->db_table,
+									stmt->dbname,
+									HASH_ENTER,
+									&found);
+
+	entry->type = Op_Delete;
+	entry->owner = InvalidOid;
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+}
+
+static void
+HandleCreateRole(CreateRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role,
+									HASH_ENTER,
+									&found);
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass && dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleAlterRole(AlterRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	DefElem    *dpass = NULL;
+	ListCell   *option;
+
+	foreach(option, stmt->options)
+	{
+		DefElem    *defel = lfirst(option);
+
+		if (strcmp(defel->defname, "password") == 0)
+			dpass = defel;
+	}
+	/* We only care about updates to the password */
+	if (!dpass)
+		return;
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->role->rolename,
+									HASH_ENTER,
+									&found);
+
+	if (!found)
+		memset(entry->old_name, 0, sizeof(entry->old_name));
+	if (dpass->arg)
+		entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg));
+	else
+		entry->password = NULL;
+	entry->type = Op_Set;
+}
+
+static void
+HandleRoleRename(RenameStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	Assert(stmt->renameType == OBJECT_ROLE);
+	bool		found = false;
+	RoleEntry  *entry = hash_search(
+									CurrentDdlTable->role_table,
+									stmt->subname,
+									HASH_FIND,
+									&found);
+
+	RoleEntry  *entry_for_new_name = hash_search(
+												 CurrentDdlTable->role_table,
+												 stmt->newname,
+												 HASH_ENTER,
+												 NULL);
+
+	entry_for_new_name->type = Op_Set;
+	if (found)
+	{
+		if (entry->old_name[0] != '\0')
+			strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN);
+		else
+			strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN);
+		entry_for_new_name->password = entry->password;
+		hash_search(
+					CurrentDdlTable->role_table,
+					entry->name,
+					HASH_REMOVE,
+					NULL);
+	}
+	else
+	{
+		strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN);
+		entry_for_new_name->password = NULL;
+	}
+}
+
+static void
+HandleDropRole(DropRoleStmt *stmt)
+{
+	InitRoleTableIfNeeded();
+	ListCell   *item;
+
+	foreach(item, stmt->roles)
+	{
+		RoleSpec   *spec = lfirst(item);
+		bool		found = false;
+		RoleEntry  *entry = hash_search(
+										CurrentDdlTable->role_table,
+										spec->rolename,
+										HASH_ENTER,
+										&found);
+
+		entry->type = Op_Delete;
+		entry->password = NULL;
+		if (!found)
+			memset(entry->old_name, 0, sizeof(entry));
+	}
+}
+
+static void
+HandleRename(RenameStmt *stmt)
+{
+	if (stmt->renameType == OBJECT_DATABASE)
+		return HandleDbRename(stmt);
+	else if (stmt->renameType == OBJECT_ROLE)
+		return HandleRoleRename(stmt);
+}
+
+static void
+NeonProcessUtility(
+				   PlannedStmt *pstmt,
+				   const char *queryString,
+				   bool readOnlyTree,
+				   ProcessUtilityContext context,
+				   ParamListInfo params,
+				   QueryEnvironment *queryEnv,
+				   DestReceiver *dest,
+				   QueryCompletion *qc)
+{
+	Node	   *parseTree = pstmt->utilityStmt;
+
+	switch (nodeTag(parseTree))
+	{
+		case T_CreatedbStmt:
+			HandleCreateDb(castNode(CreatedbStmt, parseTree));
+			break;
+		case T_AlterOwnerStmt:
+			HandleAlterOwner(castNode(AlterOwnerStmt, parseTree));
+			break;
+		case T_RenameStmt:
+			HandleRename(castNode(RenameStmt, parseTree));
+			break;
+		case T_DropdbStmt:
+			HandleDropDb(castNode(DropdbStmt, parseTree));
+			break;
+		case T_CreateRoleStmt:
+			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
+			break;
+		case T_AlterRoleStmt:
+			HandleAlterRole(castNode(AlterRoleStmt, parseTree));
+			break;
+		case T_DropRoleStmt:
+			HandleDropRole(castNode(DropRoleStmt, parseTree));
+			break;
+		default:
+			break;
+	}
+
+	if (PreviousProcessUtilityHook)
+	{
+		PreviousProcessUtilityHook(
+								   pstmt,
+								   queryString,
+								   readOnlyTree,
+								   context,
+								   params,
+								   queryEnv,
+								   dest,
+								   qc);
+	}
+	else
+	{
+		standard_ProcessUtility(
+								pstmt,
+								queryString,
+								readOnlyTree,
+								context,
+								params,
+								queryEnv,
+								dest,
+								qc);
+	}
+}
+
+extern void
+InitControlPlaneConnector()
+{
+	PreviousProcessUtilityHook = ProcessUtility_hook;
+	ProcessUtility_hook = NeonProcessUtility;
+	RegisterXactCallback(NeonXactCallback, NULL);
+	RegisterSubXactCallback(NeonSubXactCallback, NULL);
+
+	DefineCustomStringVariable(
+							   "neon.console_url",
+							   "URL of the Neon Console, which will be forwarded changes to dbs and roles",
+							   NULL,
+							   &ConsoleURL,
+							   NULL,
+							   PGC_POSTMASTER,
+							   0,
+							   NULL,
+							   NULL,
+							   NULL);
+
+	DefineCustomBoolVariable(
+							 "neon.forward_ddl",
+							 "Controls whether to forward DDL to the control plane",
+							 NULL,
+							 &ForwardDDL,
+							 true,
+							 PGC_SUSET,
+							 0,
+							 NULL,
+							 NULL,
+							 NULL);
+
+	const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN");
+
+	if (!jwt_token)
+	{
+		elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated");
+	}
+
+	if (curl_global_init(CURL_GLOBAL_DEFAULT))
+	{
+		elog(ERROR, "Failed to initialize curl");
+	}
+	if ((CurlHandle = curl_easy_init()) == NULL)
+	{
+		elog(ERROR, "Failed to initialize curl handle");
+	}
+	if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL)
+	{
+		elog(ERROR, "Failed to initialize content header");
+	}
+
+	if (jwt_token)
+	{
+		char		auth_header[8192];
+
+		snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token);
+		if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL)
+		{
+			elog(ERROR, "Failed to initialize authorization header");
+		}
+	}
+}
--- a/pgxn/neon/control_plane_connector.h
+++ b/pgxn/neon/control_plane_connector.h
@@ -0,0 +1,6 @@
+#ifndef CONTROL_PLANE_CONNECTOR_H
+#define CONTROL_PLANE_CONNECTOR_H
+
+void		InitControlPlaneConnector();
+
+#endif
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -25,6 +25,7 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
+#include "control_plane_connector.h"

 PG_MODULE_MAGIC;
 void		_PG_init(void);
@@ -34,7 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+	InitControlPlaneConnector();

+        // Important: This must happen after other parts of the extension
+        // are loaded, otherwise any settings to GUCs that were set before
+        // the extension was loaded will be removed.
 	EmitWarningsOnPlaceholders("neon");
 }

--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -254,20 +254,20 @@ nwp_register_gucs(void)

 	DefineCustomIntVariable(
 							"neon.safekeeper_reconnect_timeout",
-							"Timeout for reconnecting to offline wal acceptor.",
+							"Walproposer reconnects to offline safekeepers once in this interval.",
 							NULL,
 							&wal_acceptor_reconnect_timeout,
-							1000, 0, INT_MAX,	/* default, min, max */
+							5000, 0, INT_MAX,	/* default, min, max */
 							PGC_SIGHUP, /* context */
 							GUC_UNIT_MS,	/* flags */
 							NULL, NULL, NULL);

 	DefineCustomIntVariable(
 							"neon.safekeeper_connect_timeout",
-							"Timeout for connection establishement and it's maintenance against safekeeper",
+							"Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.",
 							NULL,
 							&wal_acceptor_connection_timeout,
-							5000, 0, INT_MAX,
+							10000, 0, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
 							NULL, NULL, NULL);
@@ -441,7 +441,7 @@ WalProposerPoll(void)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wal_acceptor_connection_timeout))
 				{
-					elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
+					elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
 						 sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
 					ShutdownConnection(sk);
 				}
@@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse))
 		return;

+	elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;

+	/* 
+	 * Note: it would be better to track the counter on per safekeeper basis,
+	 * but at worst walproposer would restart with 'term rejected', so leave as
+	 * is for now.
+	 */
 	++n_connected;
 	if (n_connected <= quorum)
 	{
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::sync::CancellationToken;
 use utils::{project_git_version, sentry_init::init_sentry};

-use tracing::{error, info, warn};
+use tracing::{error, info, warn, Instrument};

 project_git_version!(GIT_VERSION);

@@ -141,7 +141,6 @@ async fn task_main(
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
-                info!("accepted postgres client connection from {peer_addr}");

                let session_id = uuid::Uuid::new_v4();
                let tls_config = Arc::clone(&tls_config);
@@ -149,18 +148,18 @@ async fn task_main(

                connections.spawn(
                    async move {
-                        info!("spawned a task for {peer_addr}");
-
                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

-                        handle_client(dest_suffix, tls_config, session_id, socket).await
+                        info!(%peer_addr, "serving");
+                        handle_client(dest_suffix, tls_config, socket).await
                    }
                    .unwrap_or_else(|e| {
                        // Acknowledge that the task has finished with an error.
                        error!("per-client task finished with an error: {e:#}");
-                    }),
+                    })
+                    .instrument(tracing::info_span!("handle_client", ?session_id))
                );
            }
            _ = cancellation_token.cancelled() => {
@@ -192,7 +191,6 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
    let mut stream = PqStream::new(Stream::from_raw(raw_stream));

    let msg = stream.read_startup_packet().await?;
-    info!("received {msg:?}");
    use pq_proto::FeStartupPacket::*;

    match msg {
@@ -215,15 +213,19 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
            }
            Ok(raw.upgrade(tls_config).await?)
        }
-        _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
+        unexpected => {
+            info!(
+                ?unexpected,
+                "unexpected startup packet, rejecting connection"
+            );
+            stream.throw_error_str(ERR_INSECURE_CONNECTION).await?
+        }
    }
 }

-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 async fn handle_client(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
-    session_id: uuid::Uuid,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
    let tls_stream = ssl_handshake(stream, tls_config).await?;
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -1,5 +1,5 @@
-///! Various stuff for dealing with the Neon Console.
-///! Later we might move some API wrappers here.
+//! Various stuff for dealing with the Neon Console.
+//! Later we might move some API wrappers here.

 /// Payloads used in the console's APIs.
 pub mod messages;
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,4 +1,4 @@
-///! A group of high-level tests for connection establishing logic and auth.
+//! A group of high-level tests for connection establishing logic and auth.
 use super::*;
 use crate::{auth, sasl, scram};
 use async_trait::async_trait;
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -19,8 +19,10 @@ use tokio::task::JoinHandle;
 use tokio::{runtime, time::sleep};
 use tracing::*;

+use crate::metrics::BROKER_ITERATION_TIMELINES;
 use crate::metrics::BROKER_PULLED_UPDATES;
 use crate::metrics::BROKER_PUSHED_UPDATES;
+use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;

@@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
                BROKER_PUSHED_UPDATES.inc();
            }
            let elapsed = now.elapsed();
-            // Log duration every second. Should be about 10MB of logs per day.
-            info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+
+            BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
+            BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
+
+            if elapsed > push_interval / 2 {
+                info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
+            }
+
            sleep(push_interval).await;
        }
    };
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -125,6 +125,25 @@ pub static BACKUP_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
    )
    .expect("Failed to register safekeeper_backup_errors_total counter")
 });
+pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_push_update_seconds",
+        "Seconds to push all timeline updates to the broker",
+        DISK_WRITE_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec")
+});
+pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[
+    1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0,
+];
+pub static BROKER_ITERATION_TIMELINES: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_broker_iteration_timelines",
+        "Count of timelines pushed to the broker in a single iteration",
+        TIMELINES_COUNT_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec")
+});

 pub const LABEL_UNKNOWN: &str = "unknown";

--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -634,7 +634,8 @@ where
        }

        // system_id will be updated on mismatch
-        if self.state.server.system_id != msg.system_id {
+        // sync-safekeepers doesn't know sysid and sends 0, ignore it
+        if self.state.server.system_id != msg.system_id && msg.system_id != 0 {
            if self.state.server.system_id != 0 {
                warn!(
                    "unexpected system ID arrived, got {}, expected {}",
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -379,6 +379,12 @@ impl Storage for PhysicalStorage {
            );
        }

+        // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
+        // disk (this happens on each connect).
+        if end_pos == self.write_lsn {
+            return Ok(());
+        }
+
        // Close previously opened file, if any
        if let Some(mut unflushed_file) = self.file.take() {
            self.fdatasync_file(&mut unflushed_file)?;
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -1,5 +1,5 @@
 //
-// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
+// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch.
 //
 // The comment is updated on each run with the latest results.
 //
@@ -7,7 +7,7 @@
 // - uses: actions/github-script@v6
 //   with:
 //     script: |
-//       const script = require("./scripts/pr-comment-test-report.js")
+//       const script = require("./scripts/comment-test-report.js")
 //       await script({
 //         github,
 //         context,
@@ -35,8 +35,12 @@ class DefaultMap extends Map {
 module.exports = async ({ github, context, fetch, report }) => {
    // Marker to find the comment in the subsequent runs
    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // If we run the script in the PR or in the branch (main/release/...)
+    const isPullRequest = !!context.payload.pull_request
+    // Latest commit in PR or in the branch
+    const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
    // Let users know that the comment is updated automatically
-    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:</sub></div>`
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
    const githubActionsBotId = 41898282
    // Commend body itself
@@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => {

    commentBody += autoupdateNotice

-    const { data: comments } = await github.rest.issues.listComments({
-        issue_number: context.payload.number,
+    let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
+    if (isPullRequest) {
+        createCommentFn  = github.rest.issues.createComment
+        listCommentsFn   = github.rest.issues.listComments
+        updateCommentFn  = github.rest.issues.updateComment
+        issueNumberOrSha = {
+            issue_number: context.payload.number,
+        }
+    } else {
+        updateCommentFn  = github.rest.repos.updateCommitComment
+        listCommentsFn   = github.rest.repos.listCommentsForCommit
+        createCommentFn  = github.rest.repos.createCommitComment
+        issueNumberOrSha = {
+            commit_sha: commitSha,
+        }
+    }
+
+    const { data: comments } = await listCommentsFn({
+        ...issueNumberOrSha,
        ...ownerRepoParams,
    })

    const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
    if (comment) {
-        await github.rest.issues.updateComment({
+        await updateCommentFn({
            comment_id: comment.id,
            body: commentBody,
            ...ownerRepoParams,
        })
    } else {
-        await github.rest.issues.createComment({
-            issue_number: context.payload.number,
+        await createCommentFn({
            body: commentBody,
+            ...issueNumberOrSha,
            ...ownerRepoParams,
        })
    }
--- a/scripts/coverage
+++ b/scripts/coverage
@@ -156,7 +156,9 @@ class LLVM:
             profdata: Path,
             objects: List[str],
             sources: List[str],
-             demangler: Optional[Path] = None) -> None:
+             demangler: Optional[Path] = None,
+             output_file: Optional[Path] = None,
+             ) -> None:

        cwd = self.cargo.cwd
        objects = list(intersperse('-object', objects))
@@ -180,14 +182,18 @@ class LLVM:
            *objects,
            *sources,
        ]
-        subprocess.check_call(cmd, cwd=cwd)
+        if output_file is not None:
+            with output_file.open('w') as outfile:
+                subprocess.check_call(cmd, cwd=cwd, stdout=outfile)
+        else:
+            subprocess.check_call(cmd, cwd=cwd)

    def cov_report(self, **kwargs) -> None:
        self._cov(subcommand='report', **kwargs)

-    def cov_export(self, *, kind: str, **kwargs) -> None:
+    def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None:
        extras = (f'-format={kind}', )
-        self._cov(subcommand='export', *extras, **kwargs)
+        self._cov(subcommand='export', *extras, output_file=output_file, **kwargs)

    def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
        extras = [f'-format={kind}']
@@ -283,9 +289,12 @@ class TextReport(Report):
        self.llvm.cov_show(kind='text', **self._common_kwargs())


+@dataclass
 class LcovReport(Report):
+    output_file: Path
+
    def generate(self) -> None:
-        self.llvm.cov_export(kind='lcov', **self._common_kwargs())
+        self.llvm.cov_export(kind='lcov',  output_file=self.output_file, **self._common_kwargs())


@dataclass
@@ -475,7 +484,7 @@ class State:
            'text':
            lambda: TextReport(**params),
            'lcov':
-            lambda: LcovReport(**params),
+            lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'),
            'summary':
            lambda: SummaryReport(**params),
            'github':
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -535,8 +535,8 @@ def export_timeline(


 def main(args: argparse.Namespace):
-    # any psql version will do here. use current DEFAULT_PG_VERSION = 14
-    psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql")
+    # any psql version will do here. use current DEFAULT_PG_VERSION = 15
+    psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")

    old_pageserver_host = args.old_pageserver_host
    new_pageserver_host = args.new_pageserver_host
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -40,6 +40,9 @@ pub type BrokerClientChannel = BrokerServiceClient<Channel>;
 // Create connection object configured to run TLS if schema starts with https://
 // and plain text otherwise. Connection is lazy, only endpoint sanity is
 // validated here.
+//
+// NB: this function is not async, but still must be run on a tokio runtime thread
+// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call.
 pub fn connect<U>(endpoint: U, keepalive_interval: Duration) -> anyhow::Result<BrokerClientChannel>
 where
    U: std::convert::TryInto<Uri>,
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -65,12 +65,19 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_bucket",
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
+    "pageserver_getpage_get_reconstruct_data_seconds_bucket",
+    "pageserver_getpage_get_reconstruct_data_seconds_count",
+    "pageserver_getpage_get_reconstruct_data_seconds_sum",
    "pageserver_io_operations_bytes_total",
    "pageserver_io_operations_seconds_bucket",
    "pageserver_io_operations_seconds_count",
    "pageserver_io_operations_seconds_sum",
    "pageserver_last_record_lsn",
    "pageserver_materialized_cache_hits_total",
+    "pageserver_materialized_cache_hits_direct_total",
+    "pageserver_read_num_fs_layers_bucket",
+    "pageserver_read_num_fs_layers_count",
+    "pageserver_read_num_fs_layers_sum",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -629,7 +629,7 @@ class NeonEnvBuilder:
        assert self.env is not None, "environment is not already initialized, call init() first"
        self.env.start()

-    def init_start(self) -> NeonEnv:
+    def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv:
        env = self.init_configs()
        self.start()

@@ -638,7 +638,9 @@ class NeonEnvBuilder:
        log.info(
            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
        )
-        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        initial_tenant, initial_timeline = env.neon_cli.create_tenant(
+            tenant_id=env.initial_tenant, conf=initial_tenant_conf
+        )
        env.initial_timeline = initial_timeline
        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")

@@ -1444,11 +1446,12 @@ class NeonCli(AbstractNeonCli):
    def endpoint_create(
        self,
        branch_name: str,
+        pg_port: int,
+        http_port: int,
        endpoint_id: Optional[str] = None,
        tenant_id: Optional[TenantId] = None,
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
-        port: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1462,8 +1465,10 @@ class NeonCli(AbstractNeonCli):
        ]
        if lsn is not None:
            args.extend(["--lsn", str(lsn)])
-        if port is not None:
-            args.extend(["--port", str(port)])
+        if pg_port is not None:
+            args.extend(["--pg-port", str(pg_port)])
+        if http_port is not None:
+            args.extend(["--http-port", str(http_port)])
        if endpoint_id is not None:
            args.append(endpoint_id)
        if hot_standby:
@@ -1476,9 +1481,11 @@ class NeonCli(AbstractNeonCli):
    def endpoint_start(
        self,
        endpoint_id: str,
+        pg_port: int,
+        http_port: int,
+        safekeepers: Optional[List[int]] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
-        port: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1490,8 +1497,10 @@ class NeonCli(AbstractNeonCli):
        ]
        if lsn is not None:
            args.append(f"--lsn={lsn}")
-        if port is not None:
-            args.append(f"--port={port}")
+        args.extend(["--pg-port", str(pg_port)])
+        args.extend(["--http-port", str(http_port)])
+        if safekeepers is not None:
+            args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
        if endpoint_id is not None:
            args.append(endpoint_id)

@@ -1583,13 +1592,11 @@ class NeonPageserver(PgProtocol):
            ".*serving compute connection task.*exited with error: Postgres connection error.*",
            ".*serving compute connection task.*exited with error: Connection reset by peer.*",
            ".*serving compute connection task.*exited with error: Postgres query error.*",
-            ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*",
-            ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*",
-            ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*",
+            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
            # FIXME: replication patch for tokio_postgres regards  any but CopyDone/CopyData message in CopyBoth stream as unexpected
-            ".*Connection aborted: connection error: unexpected message from server*",
+            ".*Connection aborted: unexpected message from server*",
            ".*kill_and_wait_impl.*: wait successful.*",
-            ".*Replication stream finished: db error:.*ending streaming to Some*",
+            ".*: db error:.*ending streaming to Some.*",
            ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
            ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
            # safekeeper connection can fail with this, in the window between timeline creation
@@ -1603,24 +1610,25 @@ class NeonPageserver(PgProtocol):
            # https://github.com/neondatabase/neon/issues/2442
            ".*could not remove ephemeral file.*No such file or directory.*",
            # FIXME: These need investigation
-            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*",
-            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*",
            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
            ".*Removing intermediate uninit mark file.*",
-            # FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885
-            ".*sender is dropped while join handle is still alive.*",
            # Tenant::delete_timeline() can cause any of the four following errors.
            # FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
            ".*could not flush frozen layer.*queue is in state Stopped",  # when schedule layer upload fails because queued got closed before compaction got killed
            ".*wait for layer upload ops to complete.*",  # .*Caused by:.*wait_completion aborted because upload queue was stopped
            ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping",  # When gc checks timeline state after acquiring layer_removal_cs
+            ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant",  # Tenant::gc precondition
            ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
            ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
            ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
            ".*task iteration took longer than the configured period.*",
            # this is until #3501
            ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
+            # these can happen anytime we do compactions from background task and shutdown pageserver
+            r".*ERROR.*ancestor timeline \S+ is being stopped",
+            # this is expected given our collaborative shutdown approach for the UploadQueue
+            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
        ]

    def start(
@@ -1688,6 +1696,9 @@ class NeonPageserver(PgProtocol):
                else:
                    errors.append(line)

+        for error in errors:
+            log.info(f"not allowed error: {error.strip()}")
+
        assert not errors

    def log_contains(self, pattern: str) -> Optional[str]:
@@ -2280,17 +2291,24 @@ class Endpoint(PgProtocol):
    """An object representing a Postgres compute endpoint managed by the control plane."""

    def __init__(
-        self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True
+        self,
+        env: NeonEnv,
+        tenant_id: TenantId,
+        pg_port: int,
+        http_port: int,
+        check_stop_result: bool = True,
    ):
-        super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres")
+        super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres")
        self.env = env
        self.running = False
        self.branch_name: Optional[str] = None  # dubious
        self.endpoint_id: Optional[str] = None  # dubious, see asserts below
        self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
        self.tenant_id = tenant_id
-        self.port = port
+        self.pg_port = pg_port
+        self.http_port = http_port
        self.check_stop_result = check_stop_result
+        self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

    def create(
@@ -2320,7 +2338,8 @@ class Endpoint(PgProtocol):
            tenant_id=self.tenant_id,
            lsn=lsn,
            hot_standby=hot_standby,
-            port=self.port,
+            pg_port=self.pg_port,
+            http_port=self.http_port,
        )
        path = Path("endpoints") / self.endpoint_id / "pgdata"
        self.pgdata_dir = os.path.join(self.env.repo_dir, path)
@@ -2345,7 +2364,13 @@ class Endpoint(PgProtocol):

        log.info(f"Starting postgres endpoint {self.endpoint_id}")

-        self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port)
+        self.env.neon_cli.endpoint_start(
+            self.endpoint_id,
+            pg_port=self.pg_port,
+            http_port=self.http_port,
+            tenant_id=self.tenant_id,
+            safekeepers=self.active_safekeepers,
+        )
        self.running = True

        return self
@@ -2369,32 +2394,8 @@ class Endpoint(PgProtocol):
        return os.path.join(self.pg_data_dir_path(), "pg_twophase")

    def config_file_path(self) -> str:
-        """Path to postgresql.conf"""
-        return os.path.join(self.pg_data_dir_path(), "postgresql.conf")
-
-    def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint":
-        """
-        Adjust instance config for working with wal acceptors instead of
-        pageserver (pre-configured by CLI) directly.
-        """
-
-        # TODO: reuse config()
-        with open(self.config_file_path(), "r") as f:
-            cfg_lines = f.readlines()
-        with open(self.config_file_path(), "w") as f:
-            for cfg_line in cfg_lines:
-                # walproposer uses different application_name
-                if (
-                    "synchronous_standby_names" in cfg_line
-                    or
-                    # don't repeat safekeepers/wal_acceptors multiple times
-                    "neon.safekeepers" in cfg_line
-                ):
-                    continue
-                f.write(cfg_line)
-            f.write("synchronous_standby_names = 'walproposer'\n")
-            f.write("neon.safekeepers = '{}'\n".format(safekeepers))
-        return self
+        """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)"""
+        return os.path.join(self.endpoint_path(), "postgresql.conf")

    def config(self, lines: List[str]) -> "Endpoint":
        """
@@ -2499,7 +2500,8 @@ class EndpointFactory:
        ep = Endpoint(
            self.env,
            tenant_id=tenant_id or self.env.initial_tenant,
-            port=self.env.port_distributor.get_port(),
+            pg_port=self.env.port_distributor.get_port(),
+            http_port=self.env.port_distributor.get_port(),
        )
        self.num_instances += 1
        self.endpoints.append(ep)
@@ -2524,7 +2526,8 @@ class EndpointFactory:
        ep = Endpoint(
            self.env,
            tenant_id=tenant_id or self.env.initial_tenant,
-            port=self.env.port_distributor.get_port(),
+            pg_port=self.env.port_distributor.get_port(),
+            http_port=self.env.port_distributor.get_port(),
        )

        if endpoint_id is None:
@@ -2907,6 +2910,7 @@ SKIP_FILES = frozenset(
        "pg_internal.init",
        "pg.log",
        "zenith.signal",
+        "pg_hba.conf",
        "postgresql.conf",
        "postmaster.opts",
        "postmaster.pid",
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -155,14 +155,14 @@ class PageserverHttpClient(requests.Session):
        return res_json

    def tenant_create(
-        self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None
+        self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
    ) -> TenantId:
        if conf is not None:
            assert "new_tenant_id" not in conf.keys()
        res = self.post(
            f"http://localhost:{self.port}/v1/tenant",
            json={
-                "new_tenant_id": str(new_tenant_id) if new_tenant_id else None,
+                "new_tenant_id": str(new_tenant_id),
                **(conf or {}),
            },
        )
@@ -293,13 +293,13 @@ class PageserverHttpClient(requests.Session):
        self,
        pg_version: PgVersion,
        tenant_id: TenantId,
-        new_timeline_id: Optional[TimelineId] = None,
+        new_timeline_id: TimelineId,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
-            "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
+            "new_timeline_id": str(new_timeline_id),
            "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
            "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
        }
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum):
    def __repr__(self) -> str:
        return f"'{self.value}'"

+    # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums
+    def __str__(self) -> str:
+        return self.value
+
    # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14),
    # sometime we need to do so in tests.
    @property
@@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser):
@pytest.fixture(scope="session")
 def pg_version(request: FixtureRequest) -> Iterator[PgVersion]:
    if v := request.config.getoption("--pg-version"):
-        version, source = v, "from --pg-version commad-line argument"
+        version, source = v, "from --pg-version command-line argument"
    elif v := os.environ.get("DEFAULT_PG_VERSION"):
        version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable"
    else:
-        version, source = DEFAULT_VERSION, "default verson"
+        version, source = DEFAULT_VERSION, "default version"

    log.info(f"pg_version is {version} ({source})")
    yield version
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -20,6 +20,11 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
        test_name="test_attach_tenant_config",
    )
    env = neon_env_builder.init_start()
+
+    # eviction might be the first one after an attach to access the layers
+    env.pageserver.allowed_errors.append(
+        ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction"
+    )
    assert isinstance(env.remote_storage, LocalFsStorage)
    return env

@@ -158,6 +163,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
            "threshold": "23h",
        },
        "evictions_low_residence_duration_metric_threshold": "2days",
+        "gc_feedback": True,
        "gc_horizon": 23 * (1024 * 1024),
        "gc_period": "2h 13m",
        "image_creation_threshold": 7,
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -3,7 +3,7 @@ from contextlib import closing
 import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId


 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
@@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=tenant_token)
    ps.safe_psql("set FOO", password=pageserver_token)

-    new_timeline_id = env.neon_cli.create_branch(
-        "test_pageserver_auth", tenant_id=env.initial_tenant
-    )
-
    # tenant can create branches
    tenant_http_client.timeline_create(
        pg_version=env.pg_version,
        tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
    )
    # console can create branches for tenant
    pageserver_http_client.timeline_create(
        pg_version=env.pg_version,
        tenant_id=env.initial_tenant,
-        ancestor_timeline_id=new_timeline_id,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
    )

    # fail to create branch using token with different tenant_id
@@ -49,18 +47,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
        invalid_tenant_http_client.timeline_create(
            pg_version=env.pg_version,
            tenant_id=env.initial_tenant,
-            ancestor_timeline_id=new_timeline_id,
+            new_timeline_id=TimelineId.generate(),
+            ancestor_timeline_id=env.initial_timeline,
        )

    # create tenant using management token
-    pageserver_http_client.tenant_create()
+    pageserver_http_client.tenant_create(TenantId.generate())

    # fail to create tenant using tenant token
    with pytest.raises(
        PageserverApiException,
        match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
    ):
-        tenant_http_client.tenant_create()
+        tenant_http_client.tenant_create(TenantId.generate())


 def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
-            ".*could not load tenant.*load local timeline.*",
+            ".*load failed.*load local timeline.*",
        ]
    )

--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -383,6 +383,9 @@ def check_neon_works(
    cli_target = NeonCli(config_target)

    # And the current binaries to launch computes
+    snapshot_config["neon_distrib_dir"] = str(neon_current_binpath)
+    with (snapshot_config_toml).open("w") as f:
+        toml.dump(snapshot_config, f)
    config_current = copy.copy(config)
    config_current.neon_binpath = neon_current_binpath
    cli_current = NeonCli(config_current)
@@ -391,7 +394,8 @@ def check_neon_works(
    request.addfinalizer(lambda: cli_target.raw_cli(["stop"]))

    pg_port = port_distributor.get_port()
-    cli_current.endpoint_start("main", port=pg_port)
+    http_port = port_distributor.get_port()
+    cli_current.endpoint_start("main", pg_port=pg_port, http_port=http_port)
    request.addfinalizer(lambda: cli_current.endpoint_stop("main"))

    connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -1,253 +0,0 @@
-import os
-from pathlib import Path
-from subprocess import TimeoutExpired
-
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin
-
-
-# Test that compute_ctl works and prints "--sync-safekeepers" logs.
-def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-    ctl = ComputeCtl(env)
-
-    env.neon_cli.create_branch("test_compute_ctl", "main")
-    endpoint = env.endpoints.create_start("test_compute_ctl")
-    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
-
-    with open(endpoint.config_file_path(), "r") as f:
-        cfg_lines = f.readlines()
-    cfg_map = {}
-    for line in cfg_lines:
-        if "=" in line:
-            k, v = line.split("=")
-            cfg_map[k] = v.strip("\n '\"")
-    log.info(f"postgres config: {cfg_map}")
-    pgdata = endpoint.pg_data_dir_path()
-    pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres")
-
-    endpoint.stop_and_destroy()
-
-    # stop_and_destroy removes the whole endpoint directory. Recreate it.
-    Path(pgdata).mkdir(parents=True)
-
-    spec = (
-        """
-{
-    "format_version": 1.0,
-
-    "timestamp": "2021-05-23T18:25:43.511Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b",
-
-    "cluster": {
-        "cluster_id": "test-cluster-42",
-        "name": "Neon Test",
-        "state": "restarted",
-        "roles": [
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "replica",
-                "vartype": "enum"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": """
-        + f'"{cfg_map["neon.safekeepers"]}"'
-        + """,
-                "vartype": "string"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "32768",
-                "vartype": "integer"
-            },
-            {
-                "name": "port",
-                "value": """
-        + f'"{cfg_map["port"]}"'
-        + """,
-                "vartype": "integer"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "maintenance_work_mem",
-                "value": "65536",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_parallel_workers",
-                "value": "8",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_worker_processes",
-                "value": "8",
-                "vartype": "integer"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": """
-        + f'"{cfg_map["neon.tenant_id"]}"'
-        + """,
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": """
-        + f'"{cfg_map["neon.timeline_id"]}"'
-        + """,
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon",
-                "vartype": "string"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": """
-        + f'"{cfg_map["neon.pageserver_connstring"]}"'
-        + """,
-                "vartype": "string"
-            }
-        ]
-    },
-    "delta_operations": [
-    ]
-}
-"""
-    )
-
-    ps_connstr = cfg_map["neon.pageserver_connstring"]
-    log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}")
-
-    # run compute_ctl and wait for 10s
-    try:
-        ctl.raw_cli(
-            [
-                "--connstr",
-                "postgres://invalid/",
-                "--pgdata",
-                pgdata,
-                "--spec",
-                spec,
-                "--pgbin",
-                pg_bin_path,
-            ],
-            timeout=10,
-        )
-    except TimeoutExpired as exc:
-        ctl_logs = (exc.stderr or b"").decode("utf-8")
-        log.info(f"compute_ctl stderr:\n{ctl_logs}")
-
-    with ExternalProcessManager(Path(pgdata) / "postmaster.pid"):
-        start = "starting safekeepers syncing"
-        end = "safekeepers synced at LSN"
-        start_pos = ctl_logs.index(start)
-        assert start_pos != -1
-        end_pos = ctl_logs.index(end, start_pos)
-        assert end_pos != -1
-        sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)]
-        log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs)
-
-        # assert that --sync-safekeepers logs are present in the output
-        assert "connecting with node" in sync_safekeepers_logs
-        assert "connected with node" in sync_safekeepers_logs
-        assert "proposer connected to quorum (2)" in sync_safekeepers_logs
-        assert "got votes from majority (2)" in sync_safekeepers_logs
-        assert "sending elected msg to node" in sync_safekeepers_logs
-
-
-class ExternalProcessManager:
-    """
-    Context manager that kills a process with a pid file on exit.
-    """
-
-    def __init__(self, pid_file: Path):
-        self.path = pid_file
-        self.pid_file = open(pid_file, "r")
-        self.pid = int(self.pid_file.readline().strip())
-
-    def __enter__(self):
-        return self
-
-    def leave_alive(self):
-        self.pid_file.close()
-
-    def __exit__(self, _type, _value, _traceback):
-        import signal
-        import time
-
-        if self.pid_file.closed:
-            return
-
-        with self.pid_file:
-            try:
-                os.kill(self.pid, signal.SIGTERM)
-            except OSError as e:
-                if not self.path.is_file():
-                    return
-                log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}")
-                return
-
-            for _ in range(20):
-                if not self.path.is_file():
-                    return
-                time.sleep(0.2)
-
-            log.info("Process failed to stop after SIGTERM: {self.pid}")
-            os.kill(self.pid, signal.SIGKILL)
--- a/Show More
+++ b/Show More