Compare commits


18 Commits

Author | SHA1 | Message | Date
Vlad Lazar | fce602ed30 | review: remove Controller entity | 2024-06-17 16:23:14 +01:00
Vlad Lazar | fc24ba5233 | review: add drain/fill_node comments | 2024-06-17 16:23:14 +01:00
Vlad Lazar | e200a2b01e | review: lift fill plan to a separate function | 2024-06-17 16:23:11 +01:00
Vlad Lazar | 2a7f224306 | review: remove superflous sequence number bump | 2024-06-17 11:46:39 +01:00
Vlad Lazar | d86ddf2b76 | review: get_node_status -> get_node | 2024-06-17 11:45:49 +01:00
Vlad Lazar | 86d5f4ada9 | review: kick_waiters -> await_waiters_remainder | 2024-06-17 11:44:39 +01:00
Vlad Lazar | 089edb55e8 | tests: add storcon graceful restarts smoke test | 2024-06-13 18:51:02 +01:00
Vlad Lazar | 1302f9442a | storcon: add node status endpoint | 2024-06-13 18:51:02 +01:00
Vlad Lazar | 80612d2688 | storcon: reset transient node policies on re-attach | 2024-06-13 18:51:02 +01:00
Vlad Lazar | 7f96ac3435 | storcon: change default scheduling policy to Active | 2024-06-13 18:51:02 +01:00
Vlad Lazar | 999fbbb2a3 | storcon: disallow attachment optimisations for nodes in filling state | 2024-06-13 18:51:02 +01:00
Vlad Lazar | d22e0b5398 | storcon: plug drain and fill operations to the controller | 2024-06-13 18:51:02 +01:00
Vlad Lazar | 58340f9dbf | storcon: add node fill algorithm | 2024-06-13 18:50:58 +01:00
Vlad Lazar | fcbac527b0 | storcon: add node drain algorithm | 2024-06-13 18:42:52 +01:00
Vlad Lazar | a5154cf990 | storcon: add util for kicking a set of waiters repeatedly | 2024-06-11 16:03:52 +01:00
Vlad Lazar | bfe5df8c4e | storcon: add PauseForRestart node scheduling policy | 2024-06-11 16:03:52 +01:00
Vlad Lazar | 46927bc228 | storcon: expose node scheduling policy | 2024-06-11 16:03:52 +01:00
Vlad Lazar | bb9c792813 | storcon: add background node operations controller skeleton | 2024-06-11 16:03:52 +01:00
91 changed files with 1316 additions and 1341 deletions

View File

@@ -21,7 +21,7 @@
!patches/
!pgxn/
!proxy/
!storage_scrubber/
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/

View File

@@ -99,7 +99,7 @@ jobs:
# Set the --sparse-ordering option of the pytest-order plugin
# to ensure tests run in the order they appear in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,14 +410,14 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
- name: Benchmark pgvector queries
- name: Benchmark pgvector hnsw queries
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance/test_perf_pgvector_queries.py
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

View File

@@ -55,7 +55,7 @@ jobs:
exit 1
fi
- uses: actions/checkout@v4
- uses: actions/checkout@v3
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker

View File

@@ -858,7 +858,7 @@ jobs:
cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Build neon extensions test image
if: matrix.version == 'v16'
uses: docker/build-push-action@v5
@@ -965,7 +965,7 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
uses: actions/checkout@v1
with:
fetch-depth: 0
@@ -1101,8 +1101,6 @@ jobs:
$repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
done
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]

Cargo.lock (generated)
View File

@@ -5109,6 +5109,54 @@ version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
[[package]]
name = "s3_scrubber"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
"pageserver",
"pageserver_api",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
"utils",
"workspace_hack",
]
[[package]]
name = "safekeeper"
version = "0.1.0"
@@ -5753,6 +5801,7 @@ dependencies = [
"r2d2",
"reqwest 0.12.4",
"routerify",
"scopeguard",
"serde",
"serde_json",
"strum",
@@ -5765,54 +5814,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_scrubber"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"aws-config",
"aws-sdk-s3",
"aws-smithy-async",
"bincode",
"bytes",
"camino",
"chrono",
"clap",
"crc32c",
"either",
"futures",
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
"pageserver",
"pageserver_api",
"postgres_ffi",
"rand 0.8.5",
"remote_storage",
"reqwest 0.12.4",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"serde",
"serde_json",
"serde_with",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.25.0",
"tokio-stream",
"tokio-util",
"tracing",
"tracing-appender",
"tracing-subscriber",
"utils",
"workspace_hack",
]
[[package]]
name = "storcon_cli"
version = "0.1.0"
@@ -5820,7 +5821,6 @@ dependencies = [
"anyhow",
"clap",
"comfy-table",
"futures",
"humantime",
"hyper 0.14.26",
"pageserver_api",

View File

@@ -13,7 +13,7 @@ members = [
"safekeeper",
"storage_broker",
"storage_controller",
"storage_scrubber",
"s3_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
@@ -120,7 +120,7 @@ num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
parking_lot = "0.12"
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
@@ -128,7 +128,7 @@ parquet_derive = "51.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
procfs = "0.14"
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
@@ -184,7 +184,7 @@ tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
urlencoding = "2.1"

View File

@@ -141,7 +141,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.79.0
ENV RUSTC_VERSION=1.78.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -246,17 +246,12 @@ COPY patches/pgvector.patch /pgvector.patch
# By default, pgvector Makefile uses `-march=native`. We don't want that,
# because we build the images on different machines than where we run them.
# Pass OPTFLAGS="" to remove it.
RUN if [ "$(uname -m)" = "x86_64" ]; then \
OPTFLAGS=" -march=x86-64 "; \
elif [ "$(uname -m)" = "aarch64" ]; then \
OPTFLAGS=""; \
fi && \
wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="$OPTFLAGS" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="$OPTFLAGS" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
#########################################################################################
@@ -984,7 +979,7 @@ RUN cd /ext-src/ && for f in *.tar.gz; \
do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
|| exit 1; rm -f $f; done
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
# cmake is required for the h3 test
RUN apt-get update && apt-get install -y cmake
RUN patch -p1 < /ext-src/pg_hintplan.patch

View File

@@ -124,8 +124,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+@echo "Compiling test_decoding $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install
.PHONY: postgres-clean-%
postgres-clean-%:

View File

@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
Arg::new("filecache-connstr")
.long("filecache-connstr")
.default_value(
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
"host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
)
.value_name("FILECACHE_CONNSTR"),
)

View File

@@ -9,7 +9,6 @@ license.workspace = true
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
humantime.workspace = true
hyper.workspace = true
pageserver_api.workspace = true

View File

@@ -1,4 +1,3 @@
use futures::StreamExt;
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
@@ -149,22 +148,6 @@ enum Command {
#[arg(long)]
threshold: humantime::Duration,
},
// Drain a set of specified pageservers by moving the primary attachments to pageservers
// outside of the specified set.
Drain {
// Set of pageserver node ids to drain.
#[arg(long)]
nodes: Vec<NodeId>,
// Optional: migration concurrency (default is 8)
#[arg(long)]
concurrency: Option<usize>,
// Optional: maximum number of shards to migrate
#[arg(long)]
max_shards: Option<usize>,
// Optional: when set to true, nothing is migrated, but the plan is printed to stdout
#[arg(long)]
dry_run: Option<bool>,
},
}
#[derive(Parser)]
@@ -754,194 +737,6 @@ async fn main() -> anyhow::Result<()> {
})
.await?;
}
Command::Drain {
nodes,
concurrency,
max_shards,
dry_run,
} => {
// Load the list of nodes, split them up into the drained and filled sets,
// and validate that draining is possible.
let node_descs = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut node_to_drain_descs = Vec::new();
let mut node_to_fill_descs = Vec::new();
for desc in node_descs {
let to_drain = nodes.iter().any(|id| *id == desc.id);
if to_drain {
node_to_drain_descs.push(desc);
} else {
node_to_fill_descs.push(desc);
}
}
if nodes.len() != node_to_drain_descs.len() {
anyhow::bail!("Drain requested for node which doesn't exist.")
}
node_to_fill_descs.retain(|desc| {
matches!(desc.availability, NodeAvailabilityWrapper::Active)
&& matches!(
desc.scheduling,
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
)
});
if node_to_fill_descs.is_empty() {
anyhow::bail!("There are no nodes to drain to")
}
// Set the node scheduling policy to draining for the nodes which
// we plan to drain.
for node_desc in node_to_drain_descs.iter() {
let req = NodeConfigureRequest {
node_id: node_desc.id,
availability: None,
scheduling: Some(NodeSchedulingPolicy::Draining),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{}/config", node_desc.id),
Some(req),
)
.await?;
}
// Perform the drain: move each tenant shard scheduled on a node to
// be drained to a node which is being filled. A simple round robin
// strategy is used to pick the new node.
let tenants = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut selected_node_idx = 0;
struct DrainMove {
tenant_shard_id: TenantShardId,
from: NodeId,
to: NodeId,
}
let mut moves: Vec<DrainMove> = Vec::new();
let shards = tenants
.into_iter()
.flat_map(|tenant| tenant.shards.into_iter());
for shard in shards {
if let Some(max_shards) = max_shards {
if moves.len() >= max_shards {
println!(
"Stop planning shard moves since the requested maximum was reached"
);
break;
}
}
let should_migrate = {
if let Some(attached_to) = shard.node_attached {
node_to_drain_descs
.iter()
.map(|desc| desc.id)
.any(|id| id == attached_to)
} else {
false
}
};
if !should_migrate {
continue;
}
moves.push(DrainMove {
tenant_shard_id: shard.tenant_shard_id,
from: shard
.node_attached
.expect("We only migrate attached tenant shards"),
to: node_to_fill_descs[selected_node_idx].id,
});
selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
}
let total_moves = moves.len();
if dry_run == Some(true) {
println!("Dryrun requested. Planned {total_moves} moves:");
for mv in &moves {
println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
}
return Ok(());
}
const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
let mut stream = futures::stream::iter(moves)
.map(|mv| {
let client = Client::new(cli.api.clone(), cli.jwt.clone());
async move {
client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
Some(TenantShardMigrateRequest {
tenant_shard_id: mv.tenant_shard_id,
node_id: mv.to,
}),
)
.await
.map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
}
})
.buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
let mut success = 0;
let mut failure = 0;
while let Some(res) = stream.next().await {
match res {
Ok(_) => {
success += 1;
}
Err((tenant_shard_id, from, to, error)) => {
failure += 1;
println!(
"Failed to migrate {} from node {} to node {}: {}",
tenant_shard_id, from, to, error
);
}
}
if (success + failure) % 20 == 0 {
println!(
"Processed {}/{} shards: {} succeeded, {} failed",
success + failure,
total_moves,
success,
failure
);
}
}
println!(
"Processed {}/{} shards: {} succeeded, {} failed",
success + failure,
total_moves,
success,
failure
);
}
}
Ok(())
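
The removed `Drain` handler above plans migrations with a simple round robin over the fill nodes. A minimal sketch of just that planning step, distilled from the code above (the free-standing `plan_drain` function and the integer stand-ins for `NodeId`/`TenantShardId` are illustrative, not the real types):

```
// Round-robin drain planning, as described in the comments above: every
// shard attached to a node being drained is assigned to the next fill
// node in turn, up to an optional cap on the number of moves.
type NodeId = u64;
type TenantShardId = u32;

struct DrainMove {
    tenant_shard_id: TenantShardId,
    from: NodeId,
    to: NodeId,
}

fn plan_drain(
    shards: &[(TenantShardId, Option<NodeId>)], // (shard, attached node)
    drain: &[NodeId],
    fill: &[NodeId],
    max_shards: Option<usize>,
) -> Vec<DrainMove> {
    assert!(!fill.is_empty(), "there must be at least one node to drain to");
    let mut moves = Vec::new();
    let mut next_fill = 0;
    for &(tenant_shard_id, attached) in shards {
        if max_shards.map_or(false, |max| moves.len() >= max) {
            break; // requested maximum reached; stop planning
        }
        // Only shards currently attached to a drained node are migrated.
        let Some(from) = attached.filter(|node| drain.contains(node)) else {
            continue;
        };
        moves.push(DrainMove { tenant_shard_id, from, to: fill[next_fill] });
        next_fill = (next_fill + 1) % fill.len(); // round robin
    }
    moves
}
```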

View File

@@ -159,12 +159,12 @@ services:
context: ./compute_wrapper/
args:
- REPOSITORY=${REPOSITORY:-neondatabase}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
- TAG=${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-16}
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
@@ -194,7 +194,6 @@ services:
- compute
neon-test-extensions:
profiles: ["test-extensions"]
image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
entrypoint:
- "/bin/bash"

View File

@@ -15,6 +15,7 @@ set -eux -o pipefail
COMPOSE_FILE='docker-compose.yml'
cd $(dirname $0)
docker compose -f $COMPOSE_FILE
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
@@ -25,16 +26,16 @@ export http_proxy https_proxy
cleanup() {
echo "show container information"
docker ps
docker compose --profile test-extensions -f $COMPOSE_FILE logs
docker compose -f $COMPOSE_FILE logs
echo "stop containers..."
docker compose --profile test-extensions -f $COMPOSE_FILE down
docker compose -f $COMPOSE_FILE down
}
for pg_version in 14 15 16; do
echo "clean up containers if exists"
cleanup
PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d
echo "wait until the compute is ready. timeout after 60s. "
cnt=0
@@ -46,7 +47,7 @@ for pg_version in 14 15 16; do
cleanup
exit 1
fi
if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
echo "OK. The compute is ready to connect."
echo "execute simple queries."
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"

View File

@@ -4,18 +4,18 @@
Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
And additional intermediate image:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
## Build pipeline
## Building pipeline
We build all images after a successful `release` test run and push them automatically to Docker Hub with two parallel CI jobs:
1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. `neondatabase/neon`
@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
1. create containers
You can specify the version of the Neon cluster using the following environment variables.
- PG_VERSION: postgres version for compute (default is 16 as of this writing)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
```
$ cd docker-compose/
$ docker-compose down # remove the containers if they exist
$ PG_VERSION=16 TAG=latest docker-compose up --build -d # You can specify the postgres and image version
$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version
Creating network "dockercompose_default" with the default driver
Creating docker-compose_storage_broker_1 ... done
(...omit...)
@@ -47,31 +47,29 @@ Creating docker-compose_storage_broker_1 ... done
2. connect compute node
```
$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
psql (16.3)
Type "help" for help.
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ chmod 600 ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1, 1);
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
key | value
-----+-------
1 | 1
(1 row)
```
3. If you want to see the log, you can use `docker-compose logs` command.
```
# check the container name you want to see
$ docker ps
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
3582f6d76227 docker-compose_compute "/shell/compute.sh" 2 minutes ago Up 2 minutes 0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp docker-compose_compute_1
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1
(...omit...)
$ docker logs -f docker-compose_compute_1
$ docker logs -f dockercompose_compute_1
2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql
2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
(...omit...)

View File

@@ -209,6 +209,7 @@ pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
PauseForRestart,
Draining,
}
@@ -220,6 +221,7 @@ impl FromStr for NodeSchedulingPolicy {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"pause_for_restart" => Ok(Self::PauseForRestart),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
@@ -233,6 +235,7 @@ impl From<NodeSchedulingPolicy> for String {
Active => "active",
Filling => "filling",
Pause => "pause",
PauseForRestart => "pause_for_restart",
Draining => "draining",
}
.to_string()
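
With both `FromStr` and `From<NodeSchedulingPolicy> for String` touched by this change, a round-trip check over every variant is a cheap way to keep the two mappings in sync. A sketch, assuming the enum is `Copy + PartialEq + Debug` (the `ALL` slice is hypothetical; the real crate may expose the variants differently):

```
use std::str::FromStr;

#[test]
fn scheduling_policy_round_trips() {
    const ALL: &[NodeSchedulingPolicy] = &[
        NodeSchedulingPolicy::Active,
        NodeSchedulingPolicy::Filling,
        NodeSchedulingPolicy::Pause,
        NodeSchedulingPolicy::PauseForRestart,
        NodeSchedulingPolicy::Draining,
    ];
    for policy in ALL {
        // e.g. PauseForRestart <-> "pause_for_restart"
        let s = String::from(*policy);
        assert_eq!(NodeSchedulingPolicy::from_str(&s).unwrap(), *policy);
    }
}
```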

View File

@@ -558,12 +558,6 @@ impl KeySpaceRandomAccum {
self.ranges.push(range);
}
pub fn add_keyspace(&mut self, keyspace: KeySpace) {
for range in keyspace.ranges {
self.add_range(range);
}
}
pub fn to_keyspace(mut self) -> KeySpace {
let mut ranges = Vec::new();
if !self.ranges.is_empty() {

View File

@@ -7,7 +7,7 @@ license.workspace = true
[dependencies]
hyper.workspace = true
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }

View File

@@ -2,9 +2,10 @@
//! and push them to an HTTP endpoint.
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
use crate::tenant::{
mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -349,12 +350,19 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
Ok(_) => {}
Err(CalculateSyntheticSizeError::Cancelled) => {}
Err(e) => {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
return;
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled)
);
if !shutting_down {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
}
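
The replacement arm above decides whether to log by probing the `anyhow::Error` for a concrete cancellation error. A minimal self-contained sketch of that `downcast_ref` pattern (the single-variant `PageReconstructError` here is a stand-in for the real pageserver type):

```
use std::fmt;

#[derive(Debug)]
enum PageReconstructError {
    Cancelled,
}

impl fmt::Display for PageReconstructError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("cancelled")
    }
}

impl std::error::Error for PageReconstructError {}

fn is_shutdown(e: &anyhow::Error) -> bool {
    // anyhow keeps the concrete error type, so callers can detect
    // cancellation without matching on error strings.
    matches!(
        e.downcast_ref::<PageReconstructError>(),
        Some(PageReconstructError::Cancelled)
    )
}

fn main() {
    let e = anyhow::Error::new(PageReconstructError::Cancelled);
    assert!(is_shutdown(&e)); // cancellation is expected during shutdown; skip logging
}
```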

View File

@@ -1135,10 +1135,7 @@ async fn tenant_size_handler(
&ctx,
)
.await
.map_err(|e| match e {
crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})?;
.map_err(ApiError::InternalServerError)?;
let mut sizes = None;
let accepts_html = headers
@@ -1146,7 +1143,9 @@ async fn tenant_size_handler(
.map(|v| v == "text/html")
.unwrap_or_default();
if !inputs_only.unwrap_or(false) {
let storage_model = inputs.calculate_model();
let storage_model = inputs
.calculate_model()
.map_err(ApiError::InternalServerError)?;
let size = storage_model.calculate();
// If request header expects html, return html

View File

@@ -919,14 +919,6 @@ impl Timeline {
result.add_key(AUX_FILES_KEY);
}
#[cfg(test)]
{
let guard = self.extra_test_dense_keyspace.load();
for kr in &guard.ranges {
result.add_range(kr.clone());
}
}
Ok((
result.to_keyspace(),
/* AUX sparse key space */

View File

@@ -509,24 +509,11 @@ pub(crate) enum GcError {
#[error(transparent)]
Remote(anyhow::Error),
// An error reading while calculating GC cutoffs
#[error(transparent)]
GcCutoffs(PageReconstructError),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
impl From<PageReconstructError> for GcError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::TimelineCancelled,
other => Self::GcCutoffs(other),
}
}
}
impl Tenant {
/// Yet another helper for timeline initialization.
///
@@ -1046,6 +1033,7 @@ impl Tenant {
remote_metadata,
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
},
ctx,
@@ -1071,6 +1059,7 @@ impl Tenant {
timeline_id,
&index_part.metadata,
remote_timeline_client,
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
@@ -2932,9 +2921,17 @@ impl Tenant {
.checked_sub(horizon)
.unwrap_or(Lsn(0));
let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
match res {
Ok(cutoffs) => {
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
}
Err(e) => {
tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
}
}
}
if !self.is_active() || self.cancel.is_cancelled() {
@@ -3446,6 +3443,7 @@ impl Tenant {
);
TimelineResources {
remote_client,
deletion_queue_client: self.deletion_queue_client.clone(),
timeline_get_throttle: self.timeline_get_throttle.clone(),
}
}
@@ -3555,7 +3553,7 @@ impl Tenant {
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
) -> anyhow::Result<size::ModelInputs> {
let logical_sizes_at_once = self
.conf
.concurrent_tenant_size_logical_size_queries
@@ -3570,8 +3568,8 @@ impl Tenant {
// See more on issue #2748, condensed out of the initial PR review.
let mut shared_cache = tokio::select! {
locked = self.cached_logical_sizes.lock() => locked,
_ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
_ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
_ = cancel.cancelled() => anyhow::bail!("cancelled"),
_ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
};
size::gather_inputs(
@@ -3595,10 +3593,10 @@ impl Tenant {
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, size::CalculateSyntheticSizeError> {
) -> anyhow::Result<u64> {
let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
let size = inputs.calculate();
let size = inputs.calculate()?;
self.set_cached_synthetic_size(size);
@@ -4043,7 +4041,6 @@ mod tests {
use crate::repository::{Key, Value};
use crate::tenant::harness::*;
use crate::tenant::timeline::CompactFlags;
use crate::walrecord::NeonWalRecord;
use crate::DEFAULT_PG_VERSION;
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
@@ -5267,9 +5264,6 @@ mod tests {
let cancel = CancellationToken::new();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut test_key_end = test_key;
test_key_end.field6 = NUM_KEYS as u32;
tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
let mut keyspace = KeySpaceAccum::new();
@@ -6229,8 +6223,8 @@ mod tests {
let cancel = CancellationToken::new();
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let mut test_key = base_key;
let mut lsn = Lsn(0x10);
@@ -6335,7 +6329,6 @@ mod tests {
Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
)
.await?;
tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));
let child = tenant
.branch_timeline_test_with_layers(
@@ -6708,8 +6701,8 @@ mod tests {
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction")?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
@@ -6864,79 +6857,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record")?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(2), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(2),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(3), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(3),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_clear()),
),
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(4),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_init()),
),
];
let image1 = vec![(get_key(1), "0x10".into())];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![delta1], // delta layers
vec![(Lsn(0x10), image1)], // image layers
Lsn(0x50),
)
.await?;
assert_eq!(
tline.get(get_key(1), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
assert_eq!(
tline.get(get_key(2), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
Ok(())
}
}

View File

@@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> {
// cover our access to local storage.
let Ok(_guard) = self.secondary_state.gate.enter() else {
// Shutting down
return Err(UpdateError::Cancelled);
return Ok(());
};
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
@@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> {
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
tracing::debug!("Cancelled -- dropping out of layer loop");
return Err(UpdateError::Cancelled);
return Ok(());
}
// Existing on-disk layers: just update their access time.

View File

@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::{bail, Context};
use tokio::sync::oneshot::error::RecvError;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -10,7 +11,7 @@ use tokio_util::sync::CancellationToken;
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use super::{GcError, LogicalSizeCalculationCause, Tenant};
use super::{LogicalSizeCalculationCause, Tenant};
use crate::tenant::Timeline;
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -42,44 +43,6 @@ pub struct SegmentMeta {
pub kind: LsnKind,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum CalculateSyntheticSizeError {
/// Something went wrong internally in the calculation of logical size at a particular branch point
#[error("Failed to calculate logical size on timeline {timeline_id} at {lsn}: {error}")]
LogicalSize {
timeline_id: TimelineId,
lsn: Lsn,
error: CalculateLogicalSizeError,
},
/// Something went wrong internally when calculating GC parameters at start of size calculation
#[error(transparent)]
GcInfo(GcError),
/// Totally unexpected errors, like panics joining a task
#[error(transparent)]
Fatal(anyhow::Error),
/// The LSN we are trying to calculate a size at no longer exists at the point we query it
#[error("Could not find size at {lsn} in timeline {timeline_id}")]
LsnNotFound { timeline_id: TimelineId, lsn: Lsn },
/// Tenant shut down while calculating size
#[error("Cancelled")]
Cancelled,
}
impl From<GcError> for CalculateSyntheticSizeError {
fn from(value: GcError) -> Self {
match value {
GcError::TenantCancelled | GcError::TimelineCancelled => {
CalculateSyntheticSizeError::Cancelled
}
other => CalculateSyntheticSizeError::GcInfo(other),
}
}
}
impl SegmentMeta {
fn size_needed(&self) -> bool {
match self.kind {
@@ -153,9 +116,12 @@ pub(super) async fn gather_inputs(
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
) -> anyhow::Result<ModelInputs> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
tenant.refresh_gc_info(cancel, ctx).await?;
tenant
.refresh_gc_info(cancel, ctx)
.await
.context("Failed to refresh gc_info before gathering inputs")?;
// Collect information about all the timelines
let mut timelines = tenant.list_timelines();
@@ -361,12 +327,6 @@ pub(super) async fn gather_inputs(
)
.await?;
if tenant.cancel.is_cancelled() {
// If we're shutting down, return an error rather than a sparse result that might include some
// timelines from before we started shutting down
return Err(CalculateSyntheticSizeError::Cancelled);
}
Ok(ModelInputs {
segments,
timeline_inputs,
@@ -385,7 +345,7 @@ async fn fill_logical_sizes(
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
) -> Result<(), CalculateSyntheticSizeError> {
) -> anyhow::Result<()> {
let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
timelines
.iter()
@@ -427,7 +387,7 @@ async fn fill_logical_sizes(
}
// Perform the size lookups
let mut have_any_error = None;
let mut have_any_error = false;
while let Some(res) = joinset.join_next().await {
// each of these come with Result<anyhow::Result<_>, JoinError>
// because of spawn + spawn_blocking
@@ -438,36 +398,21 @@ async fn fill_logical_sizes(
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
anyhow::anyhow!(join_error)
.context("task that calls spawn_ondemand_logical_size_calculation"),
));
have_any_error = true;
}
Ok(Err(recv_result_error)) => {
// cannot really do anything, as this panic is likely a bug
error!("failed to receive logical size query result: {recv_result_error:#}");
have_any_error = Some(CalculateSyntheticSizeError::Fatal(
anyhow::anyhow!(recv_result_error)
.context("Receiving logical size query result"),
));
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
if matches!(error, CalculateLogicalSizeError::Cancelled) {
// Skip this: it's okay if one timeline among many is shutting down while we
// calculate inputs for the overall tenant.
continue;
} else {
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
timeline_id: timeline.timeline_id,
lsn,
error,
});
}
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -481,10 +426,10 @@ async fn fill_logical_sizes(
// prune any keys not needed anymore; we record every used key and added key.
logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));
if let Some(error) = have_any_error {
if have_any_error {
// we cannot complete this round, because we are missing data.
// we have however cached all we were able to request calculation on.
return Err(error);
anyhow::bail!("failed to calculate some logical_sizes");
}
// Insert the looked up sizes to the Segments
@@ -499,29 +444,32 @@ async fn fill_logical_sizes(
if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
seg.segment.size = Some(*size);
} else {
return Err(CalculateSyntheticSizeError::LsnNotFound { timeline_id, lsn });
bail!("could not find size at {} in timeline {}", lsn, timeline_id);
}
}
Ok(())
}
impl ModelInputs {
pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
// Convert SegmentMetas into plain Segments
StorageModel {
let storage = StorageModel {
segments: self
.segments
.iter()
.map(|seg| seg.segment.clone())
.collect(),
}
};
Ok(storage)
}
// calculate total project size
pub fn calculate(&self) -> u64 {
let storage = self.calculate_model();
pub fn calculate(&self) -> anyhow::Result<u64> {
let storage = self.calculate_model()?;
let sizes = storage.calculate();
sizes.total_size
Ok(sizes.total_size)
}
}
@@ -708,7 +656,7 @@ fn verify_size_for_multiple_branches() {
"#;
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
assert_eq!(inputs.calculate(), 37_851_408);
assert_eq!(inputs.calculate().unwrap(), 37_851_408);
}
#[test]
@@ -763,7 +711,7 @@ fn verify_size_for_one_branch() {
let model: ModelInputs = serde_json::from_str(doc).unwrap();
let res = model.calculate_model().calculate();
let res = model.calculate_model().unwrap().calculate();
println!("calculated synthetic size: {}", res.total_size);
println!("result: {:?}", serde_json::to_string(&res.segments));

View File

@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
#[derive(Debug)]
struct LayerKeyspace {
layer: ReadableLayer,
target_keyspace: KeySpaceRandomAccum,
target_keyspace: Vec<KeySpace>,
}
impl LayerFringe {
@@ -342,13 +342,17 @@ impl LayerFringe {
_,
LayerKeyspace {
layer,
mut target_keyspace,
target_keyspace,
},
)) => Some((
layer,
target_keyspace.consume_keyspace(),
read_desc.lsn_range,
)),
)) => {
let mut keyspace = KeySpaceRandomAccum::new();
for ks in target_keyspace {
for part in ks.ranges {
keyspace.add_range(part);
}
}
Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
}
None => unreachable!("fringe internals are always consistent"),
}
}
@@ -363,18 +367,16 @@ impl LayerFringe {
let entry = self.layers.entry(layer_id.clone());
match entry {
Entry::Occupied(mut entry) => {
entry.get_mut().target_keyspace.add_keyspace(keyspace);
entry.get_mut().target_keyspace.push(keyspace);
}
Entry::Vacant(entry) => {
self.planned_reads_by_lsn.push(ReadDesc {
lsn_range,
layer_id: layer_id.clone(),
});
let mut accum = KeySpaceRandomAccum::new();
accum.add_keyspace(keyspace);
entry.insert(LayerKeyspace {
layer,
target_keyspace: accum,
target_keyspace: vec![keyspace],
});
}
}
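
With the change above, ranges accumulate unmerged per layer and are only coalesced when the layer is popped from the fringe. A sketch of that merge with integer keys standing in for `Key` (the real `KeySpaceRandomAccum` presumably sorts and coalesces similarly in `consume_keyspace`):

```
use std::ops::Range;

// Merge randomly ordered, possibly overlapping or adjacent ranges into a
// minimal sorted set, as happens when a layer's reads are consumed.
fn coalesce(mut ranges: Vec<Range<u64>>) -> Vec<Range<u64>> {
    ranges.sort_by_key(|r| r.start);
    let mut out: Vec<Range<u64>> = Vec::new();
    for r in ranges {
        match out.last_mut() {
            // Overlapping or adjacent: extend the previous range.
            Some(last) if r.start <= last.end => last.end = last.end.max(r.end),
            _ => out.push(r),
        }
    }
    out
}

fn main() {
    let merged = coalesce(vec![5..7, 0..3, 2..4, 7..9]);
    assert_eq!(merged, vec![0..4, 5..9]);
}
```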

View File

@@ -219,6 +219,7 @@ pub struct DeltaLayerInner {
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
lsn_range: Range<Lsn>,
file: VirtualFile,
file_id: FileId,
@@ -784,6 +785,7 @@ impl DeltaLayerInner {
file_id,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
lsn_range: actual_summary.lsn_range,
max_vectored_read_bytes,
}))
}
@@ -909,7 +911,7 @@ impl DeltaLayerInner {
let reads = Self::plan_reads(
&keyspace,
lsn_range.clone(),
lsn_range,
data_end_offset,
index_reader,
planner,
@@ -922,7 +924,7 @@ impl DeltaLayerInner {
self.do_reads_and_update_state(reads, reconstruct_state, ctx)
.await;
reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);
reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
Ok(())
}

View File

@@ -62,7 +62,6 @@ use std::{
ops::ControlFlow,
};
use crate::metrics::GetKind;
use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
use crate::{
aux_file::AuxFileSizeEstimator,
@@ -76,6 +75,7 @@ use crate::{
disk_usage_eviction_task::DiskUsageEvictionInfo,
pgdatadir_mapping::CollectKeySpaceError,
};
use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
use crate::{
disk_usage_eviction_task::finite_f32,
tenant::storage_layer::{
@@ -205,6 +205,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
/// The outward-facing resources required to build a Timeline
pub struct TimelineResources {
pub remote_client: RemoteTimelineClient,
pub deletion_queue_client: DeletionQueueClient,
pub timeline_get_throttle: Arc<
crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
>,
@@ -425,14 +426,6 @@ pub struct Timeline {
/// Indicate whether aux file v2 storage is enabled.
pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
/// Some test cases directly place keys into the timeline without actually modifying the directory
/// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
/// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
/// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
/// in the future, add `extra_test_sparse_keyspace` if necessary.
#[cfg(test)]
pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
}
pub struct WalReceiverInfo {
@@ -2351,9 +2344,6 @@ impl Timeline {
aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
#[cfg(test)]
extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -4822,7 +4812,7 @@ impl Timeline {
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcCutoffs, PageReconstructError> {
) -> anyhow::Result<GcCutoffs> {
let _timer = self
.metrics
.find_gc_cutoffs_histo
@@ -5572,13 +5562,6 @@ impl Timeline {
}
Ok(layers)
}
#[cfg(test)]
pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
keyspace.merge(&ks);
self.extra_test_dense_keyspace.store(Arc::new(keyspace));
}
}
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

View File

@@ -11,6 +11,7 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use crate::{
config::PageServerConf,
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
@@ -262,6 +263,7 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
deletion_queue_client: DeletionQueueClient,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
@@ -272,6 +274,7 @@ impl DeleteTimelineFlow {
None, // Ancestor is not needed for deletion.
TimelineResources {
remote_client,
deletion_queue_client,
timeline_get_throttle: tenant.timeline_get_throttle.clone(),
},
// Important. We dont pass ancestor above because it can be missing.

View File

@@ -49,19 +49,6 @@ pub enum NeonWalRecord {
file_path: String,
content: Option<Bytes>,
},
/// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it.
#[cfg(test)]
Test {
/// Append a string to the image.
append: String,
/// Clear the image before appending.
clear: bool,
/// Treat this record as an init record. `clear` should be set to true if this field is set
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
},
}
impl NeonWalRecord {
@@ -71,39 +58,11 @@ impl NeonWalRecord {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
#[cfg(test)]
NeonWalRecord::Test { will_init, .. } => *will_init,
// None of the special neon record types currently initialize the page
_ => false,
}
}
#[cfg(test)]
pub(crate) fn wal_append(s: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
}
}
#[cfg(test)]
pub(crate) fn wal_clear() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(test)]
pub(crate) fn wal_init() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: true,
}
}
}
/// DecodedBkpBlock represents per-page data contained in a WAL record.

View File

@@ -244,20 +244,6 @@ pub(crate) fn apply_in_neon(
let mut writer = page.writer();
dir.ser_into(&mut writer)?;
}
#[cfg(test)]
NeonWalRecord::Test {
append,
clear,
will_init,
} => {
if *will_init {
assert!(*clear, "init record must be clear to ensure correctness");
}
if *clear {
page.clear();
}
page.put_slice(append.as_bytes());
}
}
Ok(())
}
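
For reference, the removed `Test` arm replays deterministically: an init record must also clear, clearing wipes the page, and the append always lands at the end. A pure-function model of that arm (a `String` stands in for the page buffer):

```
// Minimal model of the Test-record redo arm above: `clear` resets the
// page, then `append` is written at the end. Init records must clear so
// replay never depends on earlier history.
fn apply_test_record(page: &mut String, append: &str, clear: bool, will_init: bool) {
    if will_init {
        assert!(clear, "init record must be clear to ensure correctness");
    }
    if clear {
        page.clear();
    }
    page.push_str(append);
}

fn main() {
    let mut page = String::from("0x10");
    apply_test_record(&mut page, ",0x20", false, false);
    apply_test_record(&mut page, ",0x30", false, false);
    assert_eq!(page, "0x10,0x20,0x30"); // matches the expectations in the tests above
}
```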

View File

@@ -1,8 +1,19 @@
From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 2 Feb 2024 22:26:45 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index dcfb2bd..d5189ee 100644
index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
@@ -20,7 +31,7 @@ index dcfb2bd..d5189ee 100644
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
@@ -32,13 +43,14 @@ index dcfb2bd..d5189ee 100644
BuildGraph(buildstate, forkNum);
- if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
+ if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
@@ -48,7 +60,7 @@ index dcfb2bd..d5189ee 100644
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
@@ -57,6 +69,10 @@ index dcfb2bd..d5189ee 100644
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2

View File

@@ -1,19 +0,0 @@
;; see also src/tools/editors/emacs.samples for more complete settings
((c-mode . ((c-basic-offset . 4)
(c-file-style . "bsd")
(fill-column . 78)
(indent-tabs-mode . t)
(tab-width . 4)))
(nxml-mode . ((fill-column . 78)
(indent-tabs-mode . nil)))
(perl-mode . ((perl-indent-level . 4)
(perl-continued-statement-offset . 2)
(perl-continued-brace-offset . -2)
(perl-brace-offset . 0)
(perl-brace-imaginary-offset . 0)
(perl-label-offset . -2)
(indent-tabs-mode . t)
(tab-width . 4)))
(sgml-mode . ((fill-column . 78)
(indent-tabs-mode . nil))))

View File

@@ -1,14 +0,0 @@
root = true
[*.{c,h,l,y,pl,pm}]
indent_style = tab
indent_size = tab
tab_width = 4
[*.{sgml,xml}]
indent_style = space
indent_size = 1
[*.xsl]
indent_style = space
indent_size = 2

View File

@@ -19,7 +19,6 @@
#include "catalog/pg_type.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/logical.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/procsignal.h"
@@ -281,7 +280,6 @@ _PG_init(void)
pg_init_libpagestore();
pg_init_walproposer();
WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitLogicalReplicationMonitor();

View File

@@ -3112,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsn = UINT64_MAX;
/*
* GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
* segment has not changed since the basebackup, because in order to
* modify it, we would have had to download it already. And once
* downloaded, we never evict SLRU segments from local disk.
*/
not_modified_since = nm_adjust_lsn(GetRedoStartLsn());
not_modified_since = GetRedoStartLsn();
SlruKind kind;

View File

@@ -24,12 +24,8 @@
#include "walproposer.h"
static NeonWALReader *wal_reader = NULL;
struct WalSnd;
extern struct WalSnd *MyWalSnd;
extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
static XLogRecPtr
NeonWALReadWaitForWAL(XLogRecPtr loc)
@@ -40,28 +36,7 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
CHECK_FOR_INTERRUPTS();
}
// Walsender sends keepalives and stuff, so better use its normal wait
if (MyWalSnd != NULL)
return WalSndWaitForWal(loc);
for (;;)
{
XLogRecPtr flush_ptr;
if (!RecoveryInProgress())
#if PG_VERSION_NUM >= 150000
flush_ptr = GetFlushRecPtr(NULL);
#else
flush_ptr = GetFlushRecPtr();
#endif
else
flush_ptr = GetXLogReplayRecPtr(NULL);
if (loc <= flush_ptr)
return flush_ptr;
CHECK_FOR_INTERRUPTS();
pg_usleep(1000);
}
return WalSndWaitForWal(loc);
}
static int

View File

@@ -1,183 +1,16 @@
use measured::FixedCardinalityLabel;
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display};
use std::fmt;
use crate::auth::IpPattern;
use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
use crate::proxy::retry::ShouldRetry;
/// Generic error response with human-readable description.
/// Note that we can't always present it to user as is.
#[derive(Debug, Deserialize)]
pub struct ConsoleError {
pub error: Box<str>,
#[serde(skip)]
pub http_status_code: http::StatusCode,
pub status: Option<Status>,
}
impl ConsoleError {
pub fn get_reason(&self) -> Reason {
self.status
.as_ref()
.and_then(|s| s.details.error_info.as_ref())
.map(|e| e.reason)
.unwrap_or(Reason::Unknown)
}
pub fn get_user_facing_message(&self) -> String {
use super::provider::errors::REQUEST_FAILED;
self.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.clone().into())
.unwrap_or_else(|| {
// Ask @neondatabase/control-plane for review before adding more.
match self.http_status_code {
http::StatusCode::NOT_FOUND => {
// Status 404: failed to get a project-related resource.
format!("{REQUEST_FAILED}: endpoint cannot be found")
}
http::StatusCode::NOT_ACCEPTABLE => {
// Status 406: endpoint is disabled (we don't allow connections).
format!("{REQUEST_FAILED}: endpoint is disabled")
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
}
_ => REQUEST_FAILED.to_owned(),
}
})
}
}
impl Display for ConsoleError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let msg = self
.status
.as_ref()
.and_then(|s| s.details.user_facing_message.as_ref())
.map(|m| m.message.as_ref())
.unwrap_or_else(|| &self.error);
write!(f, "{}", msg)
}
}
impl ShouldRetry for ConsoleError {
fn could_retry(&self) -> bool {
if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() {
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
return match &self {
ConsoleError {
http_status_code: http::StatusCode::BAD_REQUEST,
..
} => true,
// don't retry when quotas are exceeded
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
ref error,
..
} => !error.contains("compute time quota of non-primary branches is exceeded"),
// locked can be returned when the endpoint was in transition
// or when quotas are exceeded. don't retry when quotas are exceeded
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
ref error,
..
} => {
!error.contains("quota exceeded")
&& !error.contains("the limit for current plan reached")
}
_ => false,
};
}
// retry if the response has a retry delay
if let Some(retry_info) = self
.status
.as_ref()
.and_then(|s| s.details.retry_info.as_ref())
{
retry_info.retry_delay_ms > 0
} else {
false
}
}
}
#[derive(Debug, Deserialize)]
pub struct Status {
pub code: Box<str>,
pub message: Box<str>,
pub details: Details,
}
#[derive(Debug, Deserialize)]
pub struct Details {
pub error_info: Option<ErrorInfo>,
pub retry_info: Option<RetryInfo>,
pub user_facing_message: Option<UserFacingMessage>,
}
#[derive(Debug, Deserialize)]
pub struct ErrorInfo {
pub reason: Reason,
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
}
#[derive(Clone, Copy, Debug, Deserialize, Default)]
pub enum Reason {
#[serde(rename = "ROLE_PROTECTED")]
RoleProtected,
#[serde(rename = "RESOURCE_NOT_FOUND")]
ResourceNotFound,
#[serde(rename = "PROJECT_NOT_FOUND")]
ProjectNotFound,
#[serde(rename = "ENDPOINT_NOT_FOUND")]
EndpointNotFound,
#[serde(rename = "BRANCH_NOT_FOUND")]
BranchNotFound,
#[serde(rename = "RATE_LIMIT_EXCEEDED")]
RateLimitExceeded,
#[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
NonPrimaryBranchComputeTimeExceeded,
#[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
ActiveTimeQuotaExceeded,
#[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
ComputeTimeQuotaExceeded,
#[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
WrittenDataQuotaExceeded,
#[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
DataTransferQuotaExceeded,
#[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
LogicalSizeQuotaExceeded,
#[default]
#[serde(other)]
Unknown,
}
impl Reason {
pub fn is_not_found(&self) -> bool {
matches!(
self,
Reason::ResourceNotFound
| Reason::ProjectNotFound
| Reason::EndpointNotFound
| Reason::BranchNotFound
)
}
}
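A short illustration of how `is_not_found` is meant to be consumed (a hypothetical helper built on the `ConsoleError` type above; it mirrors the 404 handling in the provider change further below, where a missing role secret is not treated as a failure):
// Hypothetical: collapse all *_NOT_FOUND reasons into "no secret present",
// so callers can fall back to a default instead of propagating the error.
fn handle_secret_error(err: ConsoleError) -> Result<Option<String>, ConsoleError> {
    if err.get_reason().is_not_found() {
        Ok(None) // e.g. role or endpoint does not exist: not fatal
    } else {
        Err(err)
    }
}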
#[derive(Debug, Deserialize)]
pub struct RetryInfo {
pub retry_delay_ms: u64,
}
#[derive(Debug, Deserialize)]
pub struct UserFacingMessage {
pub message: Box<str>,
}
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].

View File

@@ -25,8 +25,8 @@ use tracing::info;
pub mod errors {
use crate::{
console::messages::{self, ConsoleError},
error::{io_error, ReportableError, UserFacingError},
http,
proxy::retry::ShouldRetry,
};
use thiserror::Error;
@@ -34,14 +34,17 @@ pub mod errors {
use super::ApiLockError;
/// A go-to error message which doesn't leak any detail.
pub const REQUEST_FAILED: &str = "Console request failed";
const REQUEST_FAILED: &str = "Console request failed";
/// Common console API error.
#[derive(Debug, Error)]
pub enum ApiError {
/// Error returned by the console itself.
#[error("{REQUEST_FAILED} with {0}")]
Console(ConsoleError),
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
Console {
status: http::StatusCode,
text: Box<str>,
},
/// Various IO errors like broken pipe or malformed payload.
#[error("{REQUEST_FAILED}: {0}")]
@@ -50,11 +53,11 @@ pub mod errors {
impl ApiError {
/// Returns HTTP status code if it's the reason for failure.
pub fn get_reason(&self) -> messages::Reason {
pub fn http_status_code(&self) -> Option<http::StatusCode> {
use ApiError::*;
match self {
Console(e) => e.get_reason(),
_ => messages::Reason::Unknown,
Console { status, .. } => Some(*status),
_ => None,
}
}
}
@@ -64,7 +67,22 @@ pub mod errors {
use ApiError::*;
match self {
// To minimize risks, only select errors are forwarded to users.
Console(c) => c.get_user_facing_message(),
// Ask @neondatabase/control-plane for review before adding more.
Console { status, .. } => match *status {
http::StatusCode::NOT_FOUND => {
// Status 404: failed to get a project-related resource.
format!("{REQUEST_FAILED}: endpoint cannot be found")
}
http::StatusCode::NOT_ACCEPTABLE => {
// Status 406: endpoint is disabled (we don't allow connections).
format!("{REQUEST_FAILED}: endpoint is disabled")
}
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
}
_ => REQUEST_FAILED.to_owned(),
},
_ => REQUEST_FAILED.to_owned(),
}
}
@@ -73,56 +91,29 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console(e) => {
use crate::error::ErrorKind::*;
match e.get_reason() {
crate::console::messages::Reason::RoleProtected => User,
crate::console::messages::Reason::ResourceNotFound => User,
crate::console::messages::Reason::ProjectNotFound => User,
crate::console::messages::Reason::EndpointNotFound => User,
crate::console::messages::Reason::BranchNotFound => User,
crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit,
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
User
}
crate::console::messages::Reason::ActiveTimeQuotaExceeded => User,
crate::console::messages::Reason::ComputeTimeQuotaExceeded => User,
crate::console::messages::Reason::WrittenDataQuotaExceeded => User,
crate::console::messages::Reason::DataTransferQuotaExceeded => User,
crate::console::messages::Reason::LogicalSizeQuotaExceeded => User,
crate::console::messages::Reason::Unknown => match &e {
ConsoleError {
http_status_code:
http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ConsoleError {
http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
error,
..
} if error.contains(
"compute time quota of non-primary branches is exceeded",
) =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::LOCKED,
error,
..
} if error.contains("quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ConsoleError {
http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
},
}
ApiError::Console {
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ApiError::Console {
status: http::StatusCode::UNPROCESSABLE_ENTITY,
text,
} if text.contains("compute time quota of non-primary branches is exceeded") => {
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::LOCKED,
text,
} if text.contains("quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
}
@@ -133,7 +124,31 @@ pub mod errors {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
Self::Console(e) => e.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST,
..
} => true,
// don't retry when quotas are exceeded
Self::Console {
status: http::StatusCode::UNPROCESSABLE_ENTITY,
ref text,
} => !text.contains("compute time quota of non-primary branches is exceeded"),
// locked can be returned when the endpoint was in transition
// or when quotas are exceeded. don't retry when quotas are exceeded
Self::Console {
status: http::StatusCode::LOCKED,
ref text,
} => {
// written data quota exceeded
// data transfer quota exceeded
// compute time quota exceeded
// logical size quota exceeded
!text.contains("quota exceeded")
&& !text.contains("the limit for current plan reached")
}
_ => false,
}
}
}
@@ -494,7 +509,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
self.metrics
.semaphore_acquire_seconds
.observe(now.elapsed().as_secs_f64());
info!("acquired permit {:?}", now.elapsed().as_secs_f64());
Ok(WakeComputePermit { permit: permit? })
}

View File

@@ -94,14 +94,12 @@ impl Api {
let body = match parse_body::<GetRoleSecret>(response).await {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
// TODO(anna): retry
Err(e) => {
if e.get_reason().is_not_found() {
Err(e) => match e.http_status_code() {
Some(http::StatusCode::NOT_FOUND) => {
return Ok(AuthInfo::default());
} else {
return Err(e.into());
}
}
_otherwise => return Err(e.into()),
},
};
let secret = if body.role_secret.is_empty() {
@@ -330,24 +328,19 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
info!("request succeeded, processing the body");
return Ok(response.json().await?);
}
let s = response.bytes().await?;
// Log the plaintext to be able to detect whether there are cases not covered by the error struct.
info!("response_error plaintext: {:?}", s);
// Don't throw an error here because it's not as important
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
let body = response.json().await.unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ConsoleError {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
}
});
body.http_status_code = status;
error!("console responded with an error ({status}): {body:?}");
Err(ApiError::Console(body))
let text = body.error;
error!("console responded with an error ({status}): {text}");
Err(ApiError::Console { status, text })
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {

View File

@@ -12,7 +12,7 @@ use crate::auth::backend::{
};
use crate::config::{CertResolver, RetryConfig};
use crate::console::caches::NodeInfoCache;
use crate::console::messages::{ConsoleError, MetricsAuxInfo};
use crate::console::messages::MetricsAuxInfo;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
@@ -484,20 +484,18 @@ impl TestBackend for TestConnectMechanism {
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
ConnectAction::WakeFail => {
let err = console::errors::ApiError::Console(ConsoleError {
http_status_code: http::StatusCode::FORBIDDEN,
error: "TEST".into(),
status: None,
});
let err = console::errors::ApiError::Console {
status: http::StatusCode::FORBIDDEN,
text: "TEST".into(),
};
assert!(!err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
ConnectAction::WakeRetry => {
let err = console::errors::ApiError::Console(ConsoleError {
http_status_code: http::StatusCode::BAD_REQUEST,
error: "TEST".into(),
status: None,
});
let err = console::errors::ApiError::Console {
status: http::StatusCode::BAD_REQUEST,
text: "TEST".into(),
};
assert!(err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}

View File

@@ -1,5 +1,4 @@
use crate::config::RetryConfig;
use crate::console::messages::ConsoleError;
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
use crate::context::RequestMonitoring;
use crate::metrics::{
@@ -89,76 +88,36 @@ fn report_error(e: &WakeComputeError, retry: bool) {
let kind = match e {
WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress,
WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError,
WakeComputeError::ApiError(ApiError::Console(e)) => match e.get_reason() {
crate::console::messages::Reason::RoleProtected => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::ResourceNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::ProjectNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::EndpointNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::BranchNotFound => {
WakeupFailureKind::ApiConsoleBadRequest
}
crate::console::messages::Reason::RateLimitExceeded => {
WakeupFailureKind::ApiConsoleLocked
}
crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::ActiveTimeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::ComputeTimeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::WrittenDataQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::DataTransferQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::LogicalSizeQuotaExceeded => {
WakeupFailureKind::QuotaExceeded
}
crate::console::messages::Reason::Unknown => match e {
ConsoleError {
http_status_code: StatusCode::LOCKED,
ref error,
..
} if error.contains("written data quota exceeded")
|| error.contains("the limit for current plan reached") =>
{
WakeupFailureKind::QuotaExceeded
}
ConsoleError {
http_status_code: StatusCode::UNPROCESSABLE_ENTITY,
ref error,
..
} if error.contains("compute time quota of non-primary branches is exceeded") => {
WakeupFailureKind::QuotaExceeded
}
ConsoleError {
http_status_code: StatusCode::LOCKED,
..
} => WakeupFailureKind::ApiConsoleLocked,
ConsoleError {
http_status_code: StatusCode::BAD_REQUEST,
..
} => WakeupFailureKind::ApiConsoleBadRequest,
ConsoleError {
http_status_code, ..
} if http_status_code.is_server_error() => {
WakeupFailureKind::ApiConsoleOtherServerError
}
ConsoleError { .. } => WakeupFailureKind::ApiConsoleOtherError,
},
},
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::UNPROCESSABLE_ENTITY,
ref text,
}) if text.contains("compute time quota of non-primary branches is exceeded") => {
WakeupFailureKind::QuotaExceeded
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => WakeupFailureKind::ApiConsoleBadRequest,
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
WakeupFailureKind::ApiConsoleOtherServerError
}
WakeComputeError::ApiError(ApiError::Console { .. }) => {
WakeupFailureKind::ApiConsoleOtherError
}
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError,
};

View File

@@ -1,3 +1,5 @@
use std::usize;
use super::{LimitAlgorithm, Outcome, Sample};
/// Loss-based congestion avoidance.

View File

@@ -32,6 +32,8 @@ pub struct ClientFirstMessage<'a> {
pub bare: &'a str,
/// Channel binding mode.
pub cbind_flag: ChannelBinding<&'a str>,
/// [Client username](https://github.com/postgres/postgres/blob/94226d4506e66d6e7cbf/src/backend/libpq/auth-scram.c#L13).
pub username: &'a str,
/// Client nonce.
pub nonce: &'a str,
}
@@ -56,14 +58,6 @@ impl<'a> ClientFirstMessage<'a> {
// In theory, these might be preceded by "reserved-mext" (i.e. "m=")
let username = parts.next()?.strip_prefix("n=")?;
// https://github.com/postgres/postgres/blob/f83908798f78c4cafda217ca875602c88ea2ae28/src/backend/libpq/auth-scram.c#L13-L14
if !username.is_empty() {
tracing::warn!(username, "scram username provided, but is not expected")
// TODO(conrad):
// return None;
}
let nonce = parts.next()?.strip_prefix("r=")?;
// Validate but ignore auth extensions
@@ -72,6 +66,7 @@ impl<'a> ClientFirstMessage<'a> {
Some(Self {
bare,
cbind_flag,
username,
nonce,
})
}
@@ -193,18 +188,19 @@ mod tests {
// (Almost) real strings captured during debug sessions
let cases = [
(NotSupportedClient, "n,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedServer, "y,,n=,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedClient, "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
(NotSupportedServer, "y,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju"),
(
Required("tls-server-end-point"),
"p=tls-server-end-point,,n=,r=t8JwklwKecDLwSsA72rHmVju",
"p=tls-server-end-point,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju",
),
];
for (cb, input) in cases {
let msg = ClientFirstMessage::parse(input).unwrap();
assert_eq!(msg.bare, "n=,r=t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.bare, "n=pepe,r=t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.username, "pepe");
assert_eq!(msg.nonce, "t8JwklwKecDLwSsA72rHmVju");
assert_eq!(msg.cbind_flag, cb);
}
@@ -212,13 +208,14 @@ mod tests {
#[test]
fn parse_client_first_message_with_invalid_gs2_authz() {
assert!(ClientFirstMessage::parse("n,authzid,n=,r=nonce").is_none())
assert!(ClientFirstMessage::parse("n,authzid,n=user,r=nonce").is_none())
}
#[test]
fn parse_client_first_message_with_extra_params() {
let msg = ClientFirstMessage::parse("n,,n=,r=nonce,a=foo,b=bar,c=baz").unwrap();
assert_eq!(msg.bare, "n=,r=nonce,a=foo,b=bar,c=baz");
let msg = ClientFirstMessage::parse("n,,n=user,r=nonce,a=foo,b=bar,c=baz").unwrap();
assert_eq!(msg.bare, "n=user,r=nonce,a=foo,b=bar,c=baz");
assert_eq!(msg.username, "user");
assert_eq!(msg.nonce, "nonce");
assert_eq!(msg.cbind_flag, ChannelBinding::NotSupportedClient);
}
@@ -226,9 +223,9 @@ mod tests {
#[test]
fn parse_client_first_message_with_extra_params_invalid() {
// must be of the form `<ascii letter>=<...>`
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,abc=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,1=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=,r=nonce,a").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,abc=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,1=foo").is_none());
assert!(ClientFirstMessage::parse("n,,n=user,r=nonce,a").is_none());
}
#[test]

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.79.0"
channel = "1.78.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -1,5 +1,5 @@
[package]
name = "storage_scrubber"
name = "s3_scrubber"
version = "0.1.0"
edition.workspace = true
license.workspace = true

View File

@@ -1,4 +1,4 @@
# Neon Storage Scrubber
# Neon S3 scrubber
This tool directly accesses the S3 buckets used by the Neon `pageserver`
and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist.

View File

@@ -1,11 +1,11 @@
use anyhow::bail;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
use storage_scrubber::tenant_snapshot::SnapshotDownloader;
use storage_scrubber::{
use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use s3_scrubber::pageserver_physical_gc::GcMode;
use s3_scrubber::scan_pageserver_metadata::scan_metadata;
use s3_scrubber::tenant_snapshot::SnapshotDownloader;
use s3_scrubber::{
init_logging, pageserver_physical_gc::pageserver_physical_gc,
scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind,
TraversingDepth,

View File

@@ -40,6 +40,7 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -0,0 +1,57 @@
use std::{borrow::Cow, fmt::Debug, fmt::Display};
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10;
#[derive(Copy, Clone)]
pub(crate) struct Drain {
pub(crate) node_id: NodeId,
}
#[derive(Copy, Clone)]
pub(crate) struct Fill {
pub(crate) node_id: NodeId,
}
#[derive(Copy, Clone)]
pub(crate) enum Operation {
Drain(Drain),
Fill(Fill),
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum OperationError {
#[error("Node state changed during operation: {0}")]
NodeStateChanged(Cow<'static, str>),
#[error("Operation cancelled")]
Cancelled,
}
pub(crate) struct OperationHandler {
pub(crate) operation: Operation,
#[allow(unused)]
pub(crate) cancel: CancellationToken,
}
impl Display for Drain {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "drain {}", self.node_id)
}
}
impl Display for Fill {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "fill {}", self.node_id)
}
}
impl Display for Operation {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Operation::Drain(op) => write!(f, "{op}"),
Operation::Fill(op) => write!(f, "{op}"),
}
}
}

View File

@@ -480,6 +480,39 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
)
}
async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let node_status = state.service.get_node(node_id).await?;
json_response(StatusCode::OK, node_status)
}
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.start_node_drain(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.start_node_fill(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
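For reference, a hedged client-side sketch of driving the new endpoint (the function name, base URL, and the reqwest/anyhow dependencies are assumptions; the Python test fixtures further below are the in-repo equivalent):
// Hypothetical admin client for the drain endpoint; fill is analogous.
async fn request_node_drain(base: &str, token: &str, node_id: u64) -> anyhow::Result<()> {
    let resp = reqwest::Client::new()
        .put(format!("{base}/control/v1/node/{node_id}/drain"))
        .bearer_auth(token) // these endpoints require the admin scope
        .send()
        .await?;
    // The handler returns 202 Accepted: the drain runs as a background operation.
    anyhow::ensure!(
        resp.status() == reqwest::StatusCode::ACCEPTED,
        "unexpected status {}",
        resp.status()
    );
    Ok(())
}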
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
@@ -832,6 +865,16 @@ pub fn make_router(
RequestName("control_v1_node_config"),
)
})
.get("/control/v1/node/:node_id", |r| {
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
})
.put("/control/v1/node/:node_id/drain", |r| {
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
})
.put("/control/v1/node/:node_id/fill", |r| {
named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
})
// TODO(vlad): endpoint for cancelling drain and fill
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(

View File

@@ -2,6 +2,7 @@ use serde::Serialize;
use utils::seqwait::MonotonicCounter;
mod auth;
mod background_node_operations;
mod compute_hook;
mod heartbeater;
pub mod http;

View File

@@ -59,6 +59,10 @@ impl Node {
self.id
}
pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
self.scheduling
}
pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
self.scheduling = scheduling
}
@@ -141,6 +145,7 @@ impl Node {
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
NodeSchedulingPolicy::Pause => MaySchedule::No,
NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
}
}
@@ -157,7 +162,7 @@ impl Node {
listen_http_port,
listen_pg_addr,
listen_pg_port,
scheduling: NodeSchedulingPolicy::Filling,
scheduling: NodeSchedulingPolicy::Active,
availability: NodeAvailability::Offline,
cancel: CancellationToken::new(),
}

View File

@@ -442,13 +442,15 @@ impl Persistence {
#[tracing::instrument(skip_all, fields(node_id))]
pub(crate) async fn re_attach(
&self,
node_id: NodeId,
input_node_id: NodeId,
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::nodes::dsl::scheduling_policy;
use crate::schema::nodes::dsl::*;
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.filter(generation_pageserver.eq(input_node_id.0 as i64))
.set(generation.eq(generation + 1))
.execute(conn)?;
@@ -457,9 +459,23 @@ impl Persistence {
// TODO: UPDATE+SELECT in one query
let updated = tenant_shards
.filter(generation_pageserver.eq(node_id.0 as i64))
.filter(generation_pageserver.eq(input_node_id.0 as i64))
.select(TenantShardPersistence::as_select())
.load(conn)?;
// If the node went through a drain and restart phase before re-attaching,
// then reset its node scheduling policy to active.
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.filter(
scheduling_policy
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining)))
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))),
)
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
.execute(conn)?;
Ok(updated)
})
.await?;

View File

@@ -1,4 +1,5 @@
use crate::{node::Node, tenant_shard::TenantShard};
use itertools::Itertools;
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
@@ -283,6 +284,44 @@ impl Scheduler {
}
}
// Check whether the number of shards attached to a given node falls below
// the cluster average. If so, the node should be filled.
pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
let Some(node) = self.nodes.get(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return 0;
};
assert!(!self.nodes.is_empty());
let expected_attached_shards_per_node = self.expected_attached_shard_count();
for (node_id, node) in self.nodes.iter() {
tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
}
if node.attached_shard_count < expected_attached_shards_per_node {
expected_attached_shards_per_node - node.attached_shard_count
} else {
0
}
}
pub(crate) fn expected_attached_shard_count(&self) -> usize {
let total_attached_shards: usize =
self.nodes.values().map(|n| n.attached_shard_count).sum();
assert!(!self.nodes.is_empty());
total_attached_shards / self.nodes.len()
}
pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
self.nodes
.iter()
.map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
.sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
.collect()
}
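For intuition, a self-contained sketch of the arithmetic behind these helpers (standalone functions, not the `Scheduler` API itself): with three nodes holding 5, 3 and 1 attached shards, the expected count per node is 9 / 3 = 3 (integer division), so the node holding 1 shard has a fill requirement of 2.
// Standalone sketch of expected_attached_shard_count / compute_fill_requirement.
fn expected_attached(counts: &[usize]) -> usize {
    assert!(!counts.is_empty());
    counts.iter().sum::<usize>() / counts.len() // integer division rounds down
}
fn fill_requirement(counts: &[usize], node: usize) -> usize {
    // Nodes at or above the average need nothing; below-average nodes need the gap.
    expected_attached(counts).saturating_sub(counts[node])
}
fn main() {
    let counts = [5, 3, 1]; // attached shard counts per node
    assert_eq!(expected_attached(&counts), 3); // 9 / 3
    assert_eq!(fill_requirement(&counts, 2), 2); // the node with 1 shard needs 2 more
    assert_eq!(fill_requirement(&counts, 0), 0); // already above average
}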
pub(crate) fn node_upsert(&mut self, node: &Node) {
use std::collections::hash_map::Entry::*;
match self.nodes.entry(node.get_id()) {

View File

@@ -8,13 +8,17 @@ use std::{
};
use crate::{
background_node_operations::{
Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
},
compute_hook::NotifyError,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::{ReconcileError, ReconcileUnits},
scheduler::{ScheduleContext, ScheduleMode},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
tenant_shard::{
MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
ScheduleOptimizationAction,
},
};
use anyhow::Context;
@@ -134,6 +138,8 @@ struct ServiceState {
scheduler: Scheduler,
ongoing_operation: Option<OperationHandler>,
/// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
}
@@ -185,6 +191,7 @@ impl ServiceState {
tenants,
nodes: Arc::new(nodes),
scheduler,
ongoing_operation: None,
delayed_reconcile_rx,
}
}
@@ -296,6 +303,17 @@ impl From<ReconcileWaitError> for ApiError {
}
}
impl From<OperationError> for ApiError {
fn from(value: OperationError) -> Self {
match value {
OperationError::NodeStateChanged(err) => {
ApiError::InternalServerError(anyhow::anyhow!(err))
}
OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
}
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create(TenantCreateRequest),
@@ -1562,15 +1580,32 @@ impl Service {
// Setting a node active unblocks any Reconcilers that might write to the location config API,
// but those requests will not be accepted by the node until it has finished processing
// the re-attach response.
//
// Additionally, reset the node's scheduling policy to match the conditional update done
// in [`Persistence::re_attach`].
if let Some(node) = nodes.get(&reattach_req.node_id) {
if !node.is_available() {
let reset_scheduling = matches!(
node.get_scheduling(),
NodeSchedulingPolicy::PauseForRestart
| NodeSchedulingPolicy::Draining
| NodeSchedulingPolicy::Filling
);
if !node.is_available() || reset_scheduling {
let mut new_nodes = (**nodes).clone();
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
if !node.is_available() {
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
}
if reset_scheduling {
node.set_scheduling(NodeSchedulingPolicy::Active);
}
scheduler.node_upsert(node);
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
}
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
}
}
@@ -1851,6 +1886,25 @@ impl Service {
Ok(())
}
/// Same as [`Service::await_waiters`], but returns the waiters which are still
/// in progress.
async fn await_waiters_remainder(
&self,
waiters: Vec<ReconcilerWaiter>,
timeout: Duration,
) -> Vec<ReconcilerWaiter> {
let deadline = Instant::now().checked_add(timeout).unwrap();
for waiter in waiters.iter() {
let timeout = deadline.duration_since(Instant::now());
let _ = waiter.wait_timeout(timeout).await;
}
waiters
.into_iter()
.filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress))
.collect::<Vec<_>>()
}
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation or a series of shard updates.
///
@@ -2409,17 +2463,11 @@ impl Service {
(detach_waiters, shard_ids, node.clone())
};
// This reconcile wait can fail in a few ways:
// A there is a very long queue for the reconciler semaphore
// B some pageserver is failing to handle a detach promptly
// C some pageserver goes offline right at the moment we send it a request.
//
// A and C are transient: the semaphore will eventually become available, and once a node is marked offline
// the next attempt to reconcile will silently skip detaches for an offline node and succeed. If B happens,
// it's a bug, and needs resolving at the pageserver level (we shouldn't just leave attachments behind while
// deleting the underlying data).
self.await_waiters(detach_waiters, RECONCILE_TIMEOUT)
.await?;
if let Err(e) = self.await_waiters(detach_waiters, RECONCILE_TIMEOUT).await {
// Failing to detach shouldn't hold up deletion, e.g. if a node is offline we should be able
// to use some other node to run the remote deletion.
tracing::warn!("Failed to detach some locations: {e}");
}
let locations = shard_ids
.into_iter()
@@ -2437,11 +2485,13 @@ impl Service {
for result in results {
match result {
Ok(StatusCode::ACCEPTED) => {
// This should never happen: we waited for detaches to finish above
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Unexpectedly still attached on {}",
// This could happen if we failed detach above, and hit a pageserver where the tenant
// is still attached: it will accept the deletion in the background
tracing::warn!(
"Unexpectedly still attached on {}, client should retry",
node
)));
);
return Ok(StatusCode::ACCEPTED);
}
Ok(_) => {}
Err(mgmt_api::Error::Cancelled) => {
@@ -4132,6 +4182,18 @@ impl Service {
Ok(nodes)
}
pub(crate) async fn get_node(&self, node_id: NodeId) -> Result<Node, ApiError> {
self.inner
.read()
.unwrap()
.nodes
.get(&node_id)
.cloned()
.ok_or(ApiError::NotFound(
format!("Node {node_id} not registered").into(),
))
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
@@ -4286,9 +4348,6 @@ impl Service {
if let Some(scheduling) = scheduling {
node.set_scheduling(scheduling);
// TODO: once we have a background scheduling ticker for fill/drain, kick it
// to wake up and start working.
}
// Update the scheduler, in case the eligibility of the node for new shards has changed
@@ -4363,7 +4422,7 @@ impl Service {
// TODO: in the background, we should balance work back onto this pageserver
}
AvailabilityTransition::Unchanged => {
tracing::debug!("Node {} no change during config", node_id);
tracing::debug!("Node {} no availability change during config", node_id);
}
}
@@ -4372,6 +4431,176 @@ impl Service {
Ok(())
}
pub(crate) async fn start_node_drain(
self: &Arc<Self>,
node_id: NodeId,
) -> Result<(), ApiError> {
let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
let schedulable_nodes_count = nodes
.iter()
.filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
.count();
(
locked
.ongoing_operation
.as_ref()
.map(|ongoing| ongoing.operation),
node.is_available(),
node.get_scheduling(),
schedulable_nodes_count,
)
};
if let Some(ongoing) = ongoing_op {
return Err(ApiError::PreconditionFailed(
format!("Background operation already ongoing for node: {}", ongoing).into(),
));
}
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if schedulable_nodes_count == 0 {
return Err(ApiError::PreconditionFailed(
"No other schedulable nodes to drain to".into(),
));
}
match node_policy {
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
.await?;
let cancel = CancellationToken::new();
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
operation: Operation::Drain(Drain { node_id }),
cancel: cancel.clone(),
});
tokio::task::spawn({
let service = self.clone();
let cancel = cancel.clone();
async move {
scopeguard::defer! {
let prev = service.inner.write().unwrap().ongoing_operation.take();
if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) {
assert_eq!(removed_drain.node_id, node_id, "We always take the same operation");
} else {
panic!("We always remove the same operation")
}
}
service.drain_node(node_id, cancel).await
}
});
}
NodeSchedulingPolicy::Draining => {
return Err(ApiError::Conflict(format!(
"Node {node_id} has drain in progress"
)));
}
policy => {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} cannot be drained due to {policy:?} policy").into(),
));
}
}
Ok(())
}
pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
let (ongoing_op, node_available, node_policy, total_nodes_count) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(
locked
.ongoing_operation
.as_ref()
.map(|ongoing| ongoing.operation),
node.is_available(),
node.get_scheduling(),
nodes.len(),
)
};
if let Some(ongoing) = ongoing_op {
return Err(ApiError::PreconditionFailed(
format!("Background operation already ongoing for node: {}", ongoing).into(),
));
}
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if total_nodes_count <= 1 {
return Err(ApiError::PreconditionFailed(
"No other nodes to fill from".into(),
));
}
match node_policy {
NodeSchedulingPolicy::Active => {
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
.await?;
let cancel = CancellationToken::new();
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
operation: Operation::Fill(Fill { node_id }),
cancel: cancel.clone(),
});
tokio::task::spawn({
let service = self.clone();
let cancel = cancel.clone();
async move {
scopeguard::defer! {
let prev = service.inner.write().unwrap().ongoing_operation.take();
if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) {
assert_eq!(removed_fill.node_id, node_id, "We always take the same operation");
} else {
panic!("We always remove the same operation")
}
}
service.fill_node(node_id, cancel).await
}
});
}
NodeSchedulingPolicy::Filling => {
return Err(ApiError::Conflict(format!(
"Node {node_id} has fill in progress"
)));
}
policy => {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} cannot be filled due to {policy:?} policy").into(),
));
}
}
Ok(())
}
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
@@ -4956,4 +5185,268 @@ impl Service {
// to complete.
self.gate.close().await;
}
/// Drain a node by moving away the shards attached to it as primaries.
/// This is a long running operation and it should run as a separate Tokio task.
pub(crate) async fn drain_node(
&self,
node_id: NodeId,
cancel: CancellationToken,
) -> Result<(), OperationError> {
tracing::info!(%node_id, "Starting drain background operation");
let mut last_inspected_shard: Option<TenantShardId> = None;
let mut inspected_all_shards = false;
let mut waiters = Vec::new();
let mut schedule_context = ScheduleContext::default();
while !inspected_all_shards {
if cancel.is_cancelled() {
return Err(OperationError::Cancelled);
}
{
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
format!("node {node_id} was removed").into(),
))?;
let current_policy = node.get_scheduling();
if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
// about it
return Err(OperationError::NodeStateChanged(
format!("node {node_id} changed state to {current_policy:?}").into(),
));
}
let mut cursor = tenants.iter_mut().skip_while({
let skip_past = last_inspected_shard;
move |(tid, _)| match skip_past {
Some(last) => **tid != last,
None => false,
}
});
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
let (tid, tenant_shard) = match cursor.next() {
Some(some) => some,
None => {
inspected_all_shards = true;
break;
}
};
if tenant_shard.intent.demote_attached(scheduler, node_id) {
match tenant_shard.schedule(scheduler, &mut schedule_context) {
Err(e) => {
tracing::warn!(%tid, "Scheduling error when draining pageserver {} : {e}", node_id);
}
Ok(()) => {
let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
if let Some(some) = waiter {
waiters.push(some);
}
}
}
}
last_inspected_shard = Some(*tid);
}
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
while !waiters.is_empty() {
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
// At this point we have done the best we could to drain shards from this node.
// Set the node scheduling policy to [`NodeSchedulingPolicy::PauseForRestart`]
// to complete the drain.
if let Err(err) = self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart))
.await
{
// This is not fatal. Anything that is polling the node scheduling policy to detect
// the end of the drain operation will hang, but all such places should enforce an
// overall timeout. The scheduling policy will be updated upon node re-attach and/or
// by the counterpart fill operation.
tracing::warn!(%node_id, "Failed to finalise drain by setting scheduling policy: {err}");
}
tracing::info!(%node_id, "Completed drain background operation");
Ok(())
}
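The loop above resumes its scan between reconcile batches via `last_inspected_shard`. A self-contained sketch of that cursor pattern over an ordered map (hypothetical key and value types); note that resuming re-visits the cursor key itself, matching the skip_while predicate above:
use std::collections::BTreeMap;
// Hypothetical cursor-resume over an ordered map, as the drain loop does with
// tenants: skip everything up to the last key seen, then yield the next batch.
fn next_batch(map: &BTreeMap<u32, String>, last: Option<u32>, batch: usize) -> Vec<u32> {
    map.iter()
        .skip_while(|(k, _)| match last {
            Some(l) => **k != l, // stop skipping once the cursor key is reached
            None => false,
        })
        .take(batch)
        .map(|(k, _)| *k)
        .collect()
}
fn main() {
    let map: BTreeMap<u32, String> = (1..=5).map(|k| (k, format!("shard-{k}"))).collect();
    assert_eq!(next_batch(&map, None, 2), vec![1, 2]);
    // Resuming from key 2 re-inspects it first, as the drain loop does.
    assert_eq!(next_batch(&map, Some(2), 2), vec![2, 3]);
}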
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
/// 1. The node should be filled until it reaches the expected cluster average of
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
/// throughout the cluster. We achieve this by picking tenant shards from each node,
/// starting from the ones with the largest number of attached shards, until the node
/// reaches the expected cluster average.
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
let mut locked = self.inner.write().unwrap();
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
let mut tids_by_node = locked
.tenants
.iter_mut()
.filter_map(|(tid, tenant_shard)| {
if tenant_shard.intent.get_secondary().contains(&node_id) {
if let Some(primary) = tenant_shard.intent.get_attached() {
return Some((*primary, *tid));
}
}
None
})
.into_group_map();
let expected_attached = locked.scheduler.expected_attached_shard_count();
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
let mut plan = Vec::new();
for (node_id, attached) in nodes_by_load {
if plan.len() >= fill_requirement
|| tids_by_node.is_empty()
|| attached <= expected_attached
{
break;
}
let can_take = attached - expected_attached;
let mut remove_node = false;
for _ in 0..can_take {
match tids_by_node.get_mut(&node_id) {
Some(tids) => match tids.pop() {
Some(tid) => {
plan.push(tid);
}
None => {
remove_node = true;
break;
}
},
None => {
break;
}
}
}
if remove_node {
tids_by_node.remove(&node_id);
}
}
plan
}
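A worked, self-contained sketch of the greedy selection above (hypothetical id types; `nodes_by_load` is sorted descending, as `nodes_by_attached_shard_count` returns): with an expected average of 3 and a fill requirement of 2, only donors above the average give up shards, and never more than their surplus.
use std::collections::HashMap;
// Hypothetical sketch of the greedy fill plan: take promotable shards from the
// most loaded donors until the requirement is met, without dragging any donor
// below the cluster average.
fn plan_fill(
    fill_requirement: usize,
    expected: usize,
    nodes_by_load: Vec<(u64, usize)>, // (donor node id, attached count), descending
    mut candidates: HashMap<u64, Vec<&'static str>>, // donor -> shards secondary on the filled node
) -> Vec<&'static str> {
    let mut plan = Vec::new();
    for (node_id, attached) in nodes_by_load {
        if plan.len() >= fill_requirement || attached <= expected {
            break;
        }
        let can_take = attached - expected; // donor surplus over the average
        if let Some(tids) = candidates.get_mut(&node_id) {
            for _ in 0..can_take {
                match tids.pop() {
                    Some(tid) if plan.len() < fill_requirement => plan.push(tid),
                    _ => break,
                }
            }
        }
    }
    plan
}
fn main() {
    let donors = vec![(1, 5), (2, 3), (3, 1)]; // node 3 is the one being filled
    let candidates = HashMap::from([(1, vec!["t1", "t2", "t3"])]);
    // expected = 3, fill requirement = 2: node 1 has surplus 2, so two promotions.
    assert_eq!(plan_fill(2, 3, donors, candidates).len(), 2);
}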
/// Fill a node by promoting its secondaries until the cluster is balanced
/// with regard to attached shard counts. Note that this operation only
/// makes sense as a counterpart to the drain implemented in [`Service::drain_node`].
/// This is a long running operation and it should run as a separate Tokio task.
pub(crate) async fn fill_node(
&self,
node_id: NodeId,
cancel: CancellationToken,
) -> Result<(), OperationError> {
// TODO(vlad): Currently this operates on the assumption that all
// secondaries are warm. This is not always true (e.g. we just migrated the
// tenant). Take that into consideration by checking the secondary status.
tracing::info!(%node_id, "Starting fill background operation");
let mut tids_to_promote = self.fill_node_plan(node_id);
let mut waiters = Vec::new();
let mut schedule_context = ScheduleContext::default();
// Execute the plan we've composed above. Before applying each move from the plan,
// we validate to ensure that it has not gone stale in the meantime.
while !tids_to_promote.is_empty() {
if cancel.is_cancelled() {
return Err(OperationError::Cancelled);
}
{
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
format!("node {node_id} was removed").into(),
))?;
let current_policy = node.get_scheduling();
if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
// about it
return Err(OperationError::NodeStateChanged(
format!("node {node_id} changed state to {current_policy:?}").into(),
));
}
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
if let Some(tid) = tids_to_promote.pop() {
if let Some(tenant_shard) = tenants.get_mut(&tid) {
// If the node being filled is not a secondary anymore,
// skip the promotion.
if !tenant_shard.intent.get_secondary().contains(&node_id) {
continue;
}
tenant_shard.intent.promote_attached(scheduler, node_id);
match tenant_shard.schedule(scheduler, &mut schedule_context) {
Err(e) => {
tracing::warn!(%tid, "Scheduling error when filling pageserver {} : {e}", node_id);
}
Ok(()) => {
if let Some(waiter) =
self.maybe_reconcile_shard(tenant_shard, nodes)
{
waiters.push(waiter);
}
}
}
}
} else {
break;
}
}
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
while !waiters.is_empty() {
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
if let Err(err) = self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
// This isn't a huge issue since the filling process starts upon request. However, it
// will prevent the next drain from starting. The only case in which this can fail
// is database unavailability. Such a case will require manual intervention.
tracing::error!(%node_id, "Failed to finalise fill by setting scheduling policy: {err}");
}
tracing::info!(%node_id, "Completed fill background operation");
Ok(())
}
}

View File

@@ -10,7 +10,9 @@ use crate::{
reconciler::ReconcileUnits,
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use pageserver_api::controller_api::{
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
};
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -311,6 +313,12 @@ pub(crate) struct ReconcilerWaiter {
seq: Sequence,
}
pub(crate) enum ReconcilerStatus {
Done,
Failed,
InProgress,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileWaitError {
#[error("Timeout waiting for shard {0}")]
@@ -373,6 +381,16 @@ impl ReconcilerWaiter {
Ok(())
}
pub(crate) fn get_status(&self) -> ReconcilerStatus {
if self.seq_wait.would_wait_for(self.seq).is_err() {
ReconcilerStatus::Done
} else if self.error_seq_wait.would_wait_for(self.seq).is_err() {
ReconcilerStatus::Failed
} else {
ReconcilerStatus::InProgress
}
}
}
/// Having spawned a reconciler task, the tenant shard's state will carry enough
@@ -652,13 +670,17 @@ impl TenantShard {
let mut scores = all_pageservers
.iter()
.flat_map(|node_id| {
if matches!(
nodes
.get(node_id)
.map(|n| n.may_schedule())
.unwrap_or(MaySchedule::No),
MaySchedule::No
let node = nodes.get(node_id);
if node.is_none() {
None
} else if matches!(
node.unwrap().get_scheduling(),
NodeSchedulingPolicy::Filling
) {
// If the node is currently filling, don't count it as a candidate,
// to avoid racing with the background fill.
None
} else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
None
} else {
let affinity_score = schedule_context.get_node_affinity(*node_id);

View File

@@ -833,7 +833,7 @@ class NeonEnvBuilder:
def enable_scrub_on_exit(self):
"""
Call this if you would like the fixture to automatically run
storage_scrubber at the end of the test, as a bidirectional test
s3_scrubber at the end of the test, as a bidirectional test
that the scrubber is working properly, and that the code within
the test didn't produce any invalid remote state.
"""
@@ -948,7 +948,7 @@ class NeonEnvBuilder:
if self.scrub_on_exit:
try:
StorageScrubber(self).scan_metadata()
S3Scrubber(self).scan_metadata()
except Exception as e:
log.error(f"Error during remote storage scrub: {e}")
cleanup_error = e
@@ -2213,6 +2213,30 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_drain(self, node_id):
log.info(f"node_drain({node_id})")
self.request(
"PUT",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)
def node_fill(self, node_id):
log.info(f"node_fill({node_id})")
self.request(
"PUT",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)
def node_status(self, node_id):
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def node_list(self):
response = self.request(
"GET",
@@ -3937,7 +3961,7 @@ class Safekeeper(LogUtils):
wait_until(20, 0.5, paused)
class StorageScrubber:
class S3Scrubber:
def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None):
self.env = env
self.log_dir = log_dir or env.test_output_dir
@@ -3957,7 +3981,7 @@ class StorageScrubber:
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [str(self.env.neon_binpath / "storage_scrubber")]
base_args = [str(self.env.neon_binpath / "s3_scrubber")]
args = base_args + args
(output_path, stdout, status_code) = subprocess_capture(

View File

@@ -94,6 +94,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*WARN.*path=/v1/utilization .*request was dropped before completing",
# Can happen during shutdown
".*scheduling deletion on drop failed: queue is in state Stopped.*",
# Can happen during shutdown
".*ignoring failure to find gc cutoffs: timeline shutting down.*",
)

View File

@@ -1,4 +1,5 @@
import json
import os
from pathlib import Path
from typing import Any, Dict, Tuple
@@ -34,6 +35,10 @@ from performance.pageserver.util import (
@pytest.mark.timeout(
10000
) # TODO: this value is just "a really high number"; have this per instance type
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724",
)
def test_pageserver_max_throughput_getpage_at_latest_lsn(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
@@ -86,14 +91,6 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
n_tenants,
setup_wrapper,
)
env.pageserver.allowed_errors.append(
# https://github.com/neondatabase/neon/issues/6925
# https://github.com/neondatabase/neon/issues/6390
# https://github.com/neondatabase/neon/issues/6724
r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
)
run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)

View File

@@ -1,15 +0,0 @@
DROP TABLE IF EXISTS halfvec_test_table;
CREATE TABLE halfvec_test_table (
_id text NOT NULL,
title text,
text text,
embeddings halfvec(1536),
PRIMARY KEY (_id)
);
INSERT INTO halfvec_test_table (_id, title, text, embeddings)
SELECT _id, title, text, embeddings::halfvec
FROM documents;
CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128);

View File

@@ -1,13 +0,0 @@
-- run with pooled connection
-- pgbench -T 300 -c 100 -j20 -f pgbench_halfvec_queries.sql "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
with x (x) as (
select "embeddings" as x
from halfvec_test_table
TABLESAMPLE SYSTEM (1)
LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM halfvec_test_table
ORDER BY 2
LIMIT 30;

View File

@@ -0,0 +1,13 @@
-- run with pooled connection
-- pgbench -T 300 -c 100 -j20 -f pgbench_hnsw_queries.sql "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
with x (x) as (
select "embeddings" as x
from hnsw_test_table
TABLESAMPLE SYSTEM (1)
LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;

View File

@@ -106,7 +106,6 @@ QUERIES: Tuple[LabelledQuery, ...] = (
# Disable auto formatting for the list of queries so that it's easier to read
# fmt: off
PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = (
LabelledQuery("PGVPREP", r"ALTER EXTENSION VECTOR UPDATE;"),
LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"),
LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"),
LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"),
@@ -116,10 +115,6 @@ PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = (
LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"),
LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"),
LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"),
LabelledQuery("PGV9", r"DROP TABLE IF EXISTS halfvec_test_table;"),
LabelledQuery("PGV10", r"CREATE TABLE halfvec_test_table (_id text NOT NULL, title text, text text, embeddings halfvec(1536), PRIMARY KEY (_id));"),
LabelledQuery("PGV11", r"INSERT INTO halfvec_test_table (_id, title, text, embeddings) SELECT _id, title, text, embeddings::halfvec FROM documents;"),
LabelledQuery("PGV12", r"CREATE INDEX documents_half_precision_hnsw_idx ON halfvec_test_table USING hnsw (embeddings halfvec_cosine_ops) WITH (m = 64, ef_construction = 128);"),
)
# fmt: on

View File

@@ -18,7 +18,6 @@ class PgBenchLoadType(enum.Enum):
SIMPLE_UPDATE = "simple-update"
SELECT_ONLY = "select-only"
PGVECTOR_HNSW = "pgvector-hnsw"
PGVECTOR_HALFVEC = "pgvector-halfvec"
def utc_now_timestamp() -> int:
@@ -154,26 +153,6 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P
password=password,
)
if workload_type == PgBenchLoadType.PGVECTOR_HALFVEC:
# Run simple-update workload
run_pgbench(
env,
"pgvector-halfvec",
[
"pgbench",
"-f",
"test_runner/performance/pgvector/pgbench_custom_script_pgvector_halfvec_queries.sql",
"-c100",
"-j20",
f"-T{duration}",
"-P2",
"--protocol=prepared",
"--progress-timestamp",
connstr,
],
password=password,
)
env.report_size()
@@ -243,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur
@pytest.mark.remote_cluster
def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
# The following test runs on an existing database that has pgvector extension installed
# and a table with 1 million embedding vectors loaded and indexed with HNSW.
#
# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup.
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int):
run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)

View File

@@ -1,24 +0,0 @@
import pytest
from fixtures.compare_fixtures import PgCompare
from performance.test_perf_pgbench import PgBenchLoadType, get_durations_matrix, run_test_pgbench
# The following test runs on an existing database that has pgvector extension installed
# and a table with 1 million embedding vectors loaded and indexed with HNSW.
#
# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup.
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_pgvector_hnsw(remote_compare: PgCompare, duration: int):
run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)
# The following test runs on an existing database that has pgvector extension installed
# and a table with 1 million embedding vectors loaded and indexed with halfvec.
#
# Run this pgbench tests against an existing remote Postgres cluster with the necessary setup.
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_pgvector_halfvec(remote_compare: PgCompare, duration: int):
run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HALFVEC)

View File

@@ -300,7 +300,7 @@ def test_replica_query_race(neon_simple_env: NeonEnv):
p_cur.execute("CREATE TABLE test AS SELECT 0 AS counter")
standby_ep = env.endpoints.new_replica_start(origin=primary_ep, endpoint_id="standby")
wait_replica_caughtup(primary_ep, standby_ep)
time.sleep(1)
# In primary, run a lot of UPDATEs on a single page
finished = False

View File

@@ -221,35 +221,6 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint))
def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch("init")
endpoint = env.endpoints.create_start("init")
with endpoint.connect().cursor() as cur:
cur.execute("create table wal_generator (id serial primary key, data text)")
cur.execute(
"SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')"
)
cur.execute(
"""
INSERT INTO wal_generator (data)
SELECT repeat('A', 1024) -- Generates a kilobyte of data per row
FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data
"""
)
endpoint.stop_and_destroy()
endpoint = env.endpoints.create_start("init")
with endpoint.connect().cursor() as cur:
cur.execute(
"SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')"
)
# Tests that walsender correctly blocks until WAL is downloaded from safekeepers
def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg):
neon_env_builder.num_safekeepers = 3
@@ -276,7 +247,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
connstr = endpoint.connstr().replace("'", "''")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub")
logical_replication_sync(vanilla_pg, endpoint)
vanilla_pg.stop()
# Pause the safekeepers so that they can't send WAL (except to pageserver)

View File

@@ -129,33 +129,3 @@ def test_ondemand_download_replica(neon_env_builder: NeonEnvBuilder, shard_count
cur_replica = conn_replica.cursor()
cur_replica.execute("SELECT * FROM clogtest")
assert cur_replica.fetchall() == [(1,), (3,)]
def test_ondemand_download_after_wal_switch(neon_env_builder: NeonEnvBuilder):
"""
Test on-demand SLRU download on standby, when starting right after
WAL segment switch.
This is a repro for a bug in how the LSN at WAL page/segment
boundary was handled (https://github.com/neondatabase/neon/issues/8030)
"""
tenant_conf = {
"lazy_slru_download": "true",
}
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
endpoint = env.endpoints.create_start("main")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Create a test table
cur.execute("CREATE TABLE clogtest (id integer)")
cur.execute("INSERT INTO clogtest VALUES (1)")
# Start standby at WAL segment boundary
cur.execute("SELECT pg_switch_wal()")
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_insert_lsn()"))
_endpoint_at_lsn = env.endpoints.create_start(
branch_name="main", endpoint_id="ep-at-lsn", lsn=lsn
)

View File

@@ -22,7 +22,7 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
StorageScrubber,
S3Scrubber,
generate_uploads_and_deletions,
)
from fixtures.pageserver.common_types import parse_layer_file_name
@@ -215,7 +215,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = StorageScrubber(neon_env_builder).scan_metadata()
metadata_summary = S3Scrubber(neon_env_builder).scan_metadata()
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1

View File

@@ -7,7 +7,7 @@ from typing import Any, Dict, Optional
import pytest
from fixtures.common_types import TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_prefix_empty,
@@ -214,7 +214,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
# Having done a bunch of attach/detach cycles, we will have generated some index garbage: check
# that the scrubber sees it and cleans it up. We do this before the final attach+validate pass,
# to also validate that the scrubber isn't breaking anything.
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] > 0
@@ -536,7 +536,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Scrub the remote storage
# ========================
# This confirms that the scrubber isn't upset by the presence of the heatmap
StorageScrubber(neon_env_builder).scan_metadata()
S3Scrubber(neon_env_builder).scan_metadata()
# Detach secondary and delete tenant
# ===================================

View File

@@ -6,7 +6,7 @@ import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
from fixtures.neon_fixtures import (
NeonEnvBuilder,
StorageScrubber,
S3Scrubber,
)
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.workload import Workload
@@ -60,7 +60,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
output_path = neon_env_builder.test_output_dir / "snapshot"
os.makedirs(output_path)
scrubber = StorageScrubber(neon_env_builder)
scrubber = S3Scrubber(neon_env_builder)
scrubber.tenant_snapshot(tenant_id, output_path)
assert len(os.listdir(output_path)) > 0
@@ -143,18 +143,18 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
workload.write_rows(1)
# With a high min_age, the scrubber should decline to delete anything
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
# If targeting a different tenant, the scrubber shouldn't do anything
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(
gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(
min_age_secs=1, tenant_ids=[TenantId.generate()]
)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == 0
# With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations
gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1)
assert gc_summary["remote_storage_errors"] == 0
assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count

View File

@@ -11,8 +11,8 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
S3Scrubber,
StorageControllerApiException,
StorageScrubber,
last_flush_lsn_upload,
tenant_get_shards,
wait_for_last_flush_lsn,
@@ -128,7 +128,7 @@ def test_sharding_smoke(
# Check the scrubber isn't confused by sharded content, then disable
# it during teardown because we'll have deleted by then
StorageScrubber(neon_env_builder).scan_metadata()
S3Scrubber(neon_env_builder).scan_metadata()
neon_env_builder.scrub_on_exit = False
env.storage_controller.pageserver_api().tenant_delete(tenant_id)

View File

@@ -133,9 +133,6 @@ def test_storage_controller_smoke(
wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
# Let all the reconciliations after marking the node offline complete
env.storage_controller.reconcile_until_idle()
# Marking pageserver active should not migrate anything to it
# immediately
env.storage_controller.node_configure(env.pageservers[0].id, {"availability": "Active"})
@@ -1480,3 +1477,120 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
workload = Workload(env, tenant_id, timeline, branch_name=branch)
workload.expect_rows = expect_rows
workload.validate()
def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
"""
Graceful restarts of storage controller clusters use the drain and
fill hooks in order to migrate attachments away from pageservers before
restarting. In practice, Ansible will drive this process.
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_count = 5
shard_count_per_tenant = 8
total_shards = tenant_count * shard_count_per_tenant
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.neon_cli.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# Give things a chance to settle.
# A call to `reconcile_until_idle` could be used here instead;
# however, since all attachments are placed on the same node,
# we'd have to wait a long time (roughly 2 minutes) for optimizations
# to quiesce.
# TODO: once the initial attachment selection is fixed, update this
# to use `reconcile_until_idle`.
time.sleep(2)
nodes = env.storage_controller.node_list()
assert len(nodes) == 2
def retryable_node_operation(op, ps_id, max_attempts, backoff):
while max_attempts > 0:
try:
op(ps_id)
return
except StorageControllerApiException as e:
max_attempts -= 1
log.info(f"Operation failed ({max_attempts} attempts left): {e}")
if max_attempts == 0:
raise e
time.sleep(backoff)
def poll_node_status(node_id, desired_scheduling_policy, max_attempts, backoff):
log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
while max_attempts > 0:
try:
status = env.storage_controller.node_status(node_id)
policy = status["scheduling"]
if policy == desired_scheduling_policy:
return
else:
max_attempts -= 1
log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
if max_attempts == 0:
raise AssertionError(
f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
)
time.sleep(backoff)
except StorageControllerApiException as e:
max_attempts -= 1
log.info(f"Status call failed ({max_attempts} retries left): {e}")
if max_attempts == 0:
raise e
time.sleep(backoff)
def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
# Assert that all nodes have some attached shards
assert len(shard_counts) == len(env.pageservers)
min_shard_count = min(shard_counts.values())
max_shard_count = max(shard_counts.values())
flake_factor = 5 / 100
assert max_shard_count - min_shard_count <= int(total_shards * flake_factor)
# Perform a graceful rolling restart
for ps in env.pageservers:
retryable_node_operation(
lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
)
poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5)
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
# Assert that we've drained the node
assert shard_counts[str(ps.id)] == 0
# Assert that those shards actually went somewhere
assert sum(shard_counts.values()) == total_shards
ps.restart()
poll_node_status(ps.id, "Active", max_attempts=10, backoff=1)
retryable_node_operation(
lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
)
poll_node_status(ps.id, "Active", max_attempts=6, backoff=5)
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
assert_shard_counts_balanced(env, shard_counts, total_shards)
# Now check that shards are reasonably balanced
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after rolling restart: {shard_counts}")
assert_shard_counts_balanced(env, shard_counts, total_shards)
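To make the balance assertion above concrete, the tolerance arithmetic with this test's parameters works out as follows:

tenant_count = 5
shard_count_per_tenant = 8
total_shards = tenant_count * shard_count_per_tenant  # 40
flake_factor = 5 / 100
tolerance = int(total_shards * flake_factor)  # int(2.0) == 2
# i.e. after the rolling restart, attached shard counts on the two
# pageservers may differ by at most 2.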

View File

@@ -10,7 +10,7 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
StorageScrubber,
S3Scrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -707,7 +707,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
remote_storage_kind = RemoteStorageKind.MOCK_S3
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
scrubber = StorageScrubber(neon_env_builder)
scrubber = S3Scrubber(neon_env_builder)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
ps_http = env.pageserver.http_client()

View File

@@ -678,6 +678,10 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
with pytest.raises(PageserverApiException, match=matcher):
completion.result()
# this happens in both cases
env.pageserver.allowed_errors.append(
".*ignoring failure to find gc cutoffs: timeline shutting down.*"
)
# this happens only in the case of deletion (http response logging)
env.pageserver.allowed_errors.append(".*Failed to refresh gc_info before gathering inputs.*")

View File

@@ -1,9 +1,7 @@
import time
from contextlib import closing
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
from fixtures.utils import query_scalar
#
@@ -115,88 +113,11 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
assert cur_new.fetchall() == []
def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder):
"""
Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record.
This is a repro for the bug fixed in commit 66fa176cc8.
"""
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
# If auto-analyze runs at the same time that we run VACUUM FREEZE, it
# can hold a snapshot that prevents the tuples from being frozen.
"autovacuum=off",
"log_checkpoints=on",
],
)
# Run the tests in a dedicated database, because the activity monitor
# periodically runs some queries against the 'postgres' database. If that
# happens at the same time that we're trying to freeze, the activity
# monitor's queries can hold back the xmin horizon and prevent freezing.
with closing(endpoint.connect()) as pg_conn:
pg_conn.cursor().execute("CREATE DATABASE vmbitsdb")
pg_conn = endpoint.connect(dbname="vmbitsdb")
cur = pg_conn.cursor()
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
cur.execute("CREATE EXTENSION pageinspect")
# Create a test table and freeze it to set the all-frozen VM bit on all pages.
cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
cur.execute("BEGIN")
cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
xid = int(query_scalar(cur, "SELECT txid_current()"))
cur.execute("COMMIT")
cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock")
for notice in pg_conn.notices:
log.info(f"{notice}")
# This test has been flaky in the past because background activity, such as
# auto-analyze and compute_ctl's activity monitor queries, has prevented the
# tuples from being frozen. Check that they were frozen.
relfrozenxid = int(
query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'")
)
assert (
relfrozenxid > xid
), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}"
# Lock a row. This clears the all-frozen VM bit for that page.
cur.execute("BEGIN")
cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
cur.execute("COMMIT")
# The VM page in the shared buffer cache, and the same page as reconstructed by
# the pageserver, should be equal. Except for the LSN: clearing a bit in the
# VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the LSN
# when it replays the VM-bit clearing record (since commit 387a36874c)
#
# This is a bit fragile; we've had a lot of flakiness in this test before. For
# example, the VM bits were sometimes not all set because a concurrent
# autoanalyze prevented the VACUUM FREEZE from freezing the tuples. Or
# autovacuum kicked in and re-froze the page between the
# get_raw_page() and get_raw_page_at_lsn() calls. We disable autovacuum now,
# which should make this deterministic.
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex()
cur.execute(
"select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
)
vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex()
assert vm_page_at_pageserver == vm_page_in_cache
def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
"""
The previous test is enough to verify the bug that was fixed in
commit 66fa176cc8. But for good measure, we also reproduce the
original problem that the missing VM page update caused.
"""
#
# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
# record.
#
def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder):
tenant_conf = {
"checkpoint_distance": f"{128 * 1024}",
"compaction_target_size": f"{128 * 1024}",
@@ -209,9 +130,9 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock")
endpoint = env.endpoints.create_start(
"main",
"test_vm_bit_clear_on_heap_lock",
config_lines=[
"log_autovacuum_min_duration = 0",
# Perform anti-wraparound vacuuming aggressively
@@ -225,10 +146,12 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
cur.execute("CREATE EXTENSION pageinspect")
# Create a test table and freeze it to set the all-frozen VM bit on all pages.
cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock")
# Lock a row. This clears the all-frozen VM bit for that page.
@@ -242,6 +165,27 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
cur.execute("COMMIT")
# The VM page in the shared buffer cache, and the same page as reconstructed
# by the pageserver, should be equal.
#
# Ignore page header (24 bytes) of visibility map.
# If the dirty VM page is flushed from the cache for some reason,
# it gets WAL-logged, which changes the LSN on the page.
# Also, in the neon SMGR an empty heap page can be replaced with a zeroed (uninitialized) heap page.
cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )")
vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex()
cur.execute(
"select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )"
)
vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex()
assert vm_page_at_pageserver == vm_page_in_cache
# The above assert is enough to verify the bug that was fixed in
# commit 66fa176cc8. But for good measure, we also reproduce the
# original problem that the missing VM page update caused. The
# rest of the test does that.
# Kill and restart postgres, to clear the buffer cache.
#
# NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages

View File

@@ -601,16 +601,13 @@ async def run_segment_init_failure(env: NeonEnv):
conn = await ep.connect_async()
ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary
# next insertion should hang until failpoint is disabled.
bg_query = asyncio.create_task(
conn.execute("insert into t select generate_series(1,1), 'payload'")
)
asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'"))
sleep_sec = 2
await asyncio.sleep(sleep_sec)
# also restart ep at segment boundary to make test more interesting
ep.stop()
# it must still be not finished
assert not bg_query.done()
# Also restart ep at the segment boundary to make the test more interesting. Do it in immediate mode;
# 'fast' mode would hang because it would try to gracefully finish sending WAL.
ep.stop(mode="immediate")
# assert not bg_query.done()
# Without segment rename during init (#6402), the previous statement created a
# partially initialized 16MB segment, so sk restart also triggers #6401.
sk.stop().start()

View File

@@ -1,5 +1,5 @@
{
"v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"],
"v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"],
"v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"]
"v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"],
"v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"],
"v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"]
}

View File

@@ -18,7 +18,7 @@ commands:
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
- name: sql-exporter
user: nobody
sysvInitAction: respawn
@@ -93,7 +93,7 @@ files:
target:
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
# the schema gets dropped or replaced to match the driver expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
@@ -128,7 +128,7 @@ files:
target:
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
# the schema gets dropped or replaced to match the driver expected DSN format.
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
# Collectors (referenced by name) to execute on the target.
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
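As an aside, the exporter configs above use the two common Postgres DSN styles; libpq-compatible drivers generally accept either form (values copied from this config):

# key=value form, as used by postgres-exporter above
dsn_kv = "user=cloud_admin sslmode=disable dbname=postgres"
# URI form, as used by sql-exporter above
dsn_uri = "postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable"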
@@ -304,9 +304,7 @@ files:
- slot_name
values: [restart_lsn]
query: |
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
from pg_replication_slots
where slot_type = 'logical';
select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical';
- metric_name: retained_wal
type: gauge