Polish API handler and refresh OpenAPI spec

Use Condvar and make configuration API blocking
Allow starting compute_ctl without spec
2026-05-20 22:50:38 +00:00 · 2023-04-05 23:35:35 +03:00 · 2023-04-05 23:19:10 +03:00 · 2023-04-05 22:09:43 +03:00 · 2023-04-05 21:31:44 +03:00 · 2023-04-05 20:04:14 +03:00
157 changed files with 4521 additions and 1663 deletions
--- a/.github/actions/allure-report/action.yml
+++ b/.github/actions/allure-report/action.yml
@@ -15,10 +15,32 @@ outputs:
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
+  report-json-url:
+    description: 'Allure report JSON URL'
+    value: ${{ steps.generate-report.outputs.report-json-url }}

 runs:
  using: "composite"
+
  steps:
+    # We use some environment variables quite often, so let's set them once.
+    #
+    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
+    #
+    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
+    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
+    #
+    - name: Set common environment variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        echo "BUILD_TYPE=${BUILD_TYPE}"   >> $GITHUB_ENV
+        echo "BUCKET=${BUCKET}"           >> $GITHUB_ENV
+        echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
+      env:
+        BUILD_TYPE: ${{ inputs.build_type }}
+        BUCKET: neon-github-public-dev
+        TEST_OUTPUT: /tmp/test_output
+
    - name: Validate input parameters
      shell: bash -euxo pipefail {0}
      run: |
@@ -76,16 +98,14 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.19.0
-        ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
+        ALLURE_VERSION: 2.21.0
+        ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c

    - name: Upload Allure results
      if: ${{ inputs.action == 'store' }}
      env:
        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_OUTPUT: /tmp/test_output
-        BUCKET: neon-github-public-dev
        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
      shell: bash -euxo pipefail {0}
      run: |
@@ -104,7 +124,7 @@ runs:
        EOF
        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
          TEST_SELECTION=${{ inputs.test_selection }}
-          BUILD_TYPE=${{ inputs.build_type }}
+          BUILD_TYPE=${BUILD_TYPE}
        EOF

        ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
@@ -113,13 +133,12 @@ runs:
        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"

-    # Potentially we could have several running build for the same key (for example for the main branch),  so we use improvised lock for this
+    # Potentially we could have several builds running for the same key (for example, for the main branch), so we use an improvised lock for this
    - name: Acquire Allure lock
      if: ${{ inputs.action == 'generate' }}
      shell: bash -euxo pipefail {0}
      env:
        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        BUCKET: neon-github-public-dev
        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
      run: |
        LOCK_TIMEOUT=300 # seconds
@@ -149,8 +168,6 @@ runs:
      env:
        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_OUTPUT: /tmp/test_output
-        BUCKET: neon-github-public-dev
      shell: bash -euxo pipefail {0}
      run: |
        # Get previously uploaded data for this run
@@ -186,24 +203,24 @@ runs:
        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html

        # Generate redirect
-        cat <<EOF > ./index.html
+        cat <<EOF > ${TEST_OUTPUT}/allure/index.html
          <!DOCTYPE html>

          <meta charset="utf-8">
          <title>Redirecting to ${REPORT_URL}</title>
          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
        EOF
-        aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+        aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
        echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT

    - name: Release Allure lock
      if: ${{ inputs.action == 'generate' && always() }}
      shell: bash -euxo pipefail {0}
      env:
        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        BUCKET: neon-github-public-dev
        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
      run: |
        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
@@ -212,11 +229,16 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        rm -rf ${TEST_OUTPUT}/allure
+
    - uses: actions/github-script@v6
      if: ${{ inputs.action == 'generate' && always() }}
      env:
        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
-        BUILD_TYPE: ${{ inputs.build_type }}
        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      with:
        script: |
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,10 @@ inputs:
    description: 'Secret access key'
    required: false
    default: ''
+  rerun_flaky:
+    description: 'Whether to rerun flaky tests'
+    required: false
+    default: 'false'

 runs:
  using: "composite"
@@ -101,6 +105,7 @@ runs:
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
+        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
      shell: bash -euxo pipefail {0}
      run: |
        # PLATFORM will be embedded in the perf test report
@@ -143,6 +148,13 @@ runs:
          EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
        fi

+        if [ "${RERUN_FLAKY}" == "true" ]; then
+          mkdir -p $TEST_OUTPUT
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"
+
+          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
+        fi
+
        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -8,6 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -8,6 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -8,6 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -8,6 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
+        min_avail_bytes: 0
+        period: "10s"
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "10m"
+          threshold: &default_eviction_threshold "24h"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -8,11 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 80
+        min_avail_bytes: 0
+        period: "10s"
      tenant_config:
        eviction_policy:
          kind: "LayerAccessThreshold"
          period: "20m"
-          threshold: "20m"
+          threshold: &default_eviction_threshold "20m"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -8,11 +8,16 @@ storage:
      pg_distrib_dir: /usr/local
      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
      metric_collection_interval: 10min
+      disk_usage_based_eviction:
+        max_usage_pct: 80
+        min_avail_bytes: 0
+        period: "10s"
      tenant_config:
        eviction_policy:
          kind: "LayerAccessThreshold"
          period: "20m"
-          threshold: "20m"
+          threshold: &default_eviction_threshold "20m"
+      evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
      remote_storage:
        bucket_name: "{{ bucket_name }}"
        bucket_region: "{{ bucket_region }}"
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -3,8 +3,12 @@
 ## Issue ticket number and link

 ## Checklist before requesting a review
+
 - [ ] I have performed a self-review of my code.
 - [ ] If it is a core feature, I have added thorough tests.
 - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
 - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

+## Checklist before merging
+
+- [ ] Do not forget to reformat the commit message so that it does not include the above checklist
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -335,6 +335,9 @@ jobs:
          real_s3_region: us-west-2
          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+          rerun_flaky: true
+        env:
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug'
@@ -371,42 +374,88 @@ jobs:
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

-  merge-allure-report:
+  create-test-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
    if: ${{ !cancelled() }}
-    strategy:
-      fail-fast: false
-      matrix:
-        build_type: [ debug, release ]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: false

-      - name: Create Allure report
-        id: create-allure-report
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Create Allure report (debug)
+        if: ${{ !cancelled() }}
+        id: create-allure-report-debug
        uses: ./.github/actions/allure-report
        with:
          action: generate
-          build_type: ${{ matrix.build_type }}
+          build_type: debug
+
+      - name: Create Allure report (release)
+        if: ${{ !cancelled() }}
+        id: create-allure-report-release
+        uses: ./.github/actions/allure-report
+        with:
+          action: generate
+          build_type: release
+
+      - uses: actions/github-script@v6
+        if: >
+          !cancelled() &&
+          github.event_name == 'pull_request' && (
+            steps.create-allure-report-debug.outputs.report-url ||
+            steps.create-allure-report-release.outputs.report-url
+          )
+        with:
+          script: |
+            const reports = [{
+              buildType: "debug",
+              reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
+              jsonUrl:   "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
+            }, {
+              buildType: "release",
+              reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
+              jsonUrl:   "${{ steps.create-allure-report-release.outputs.report-json-url }}",
+            }]
+
+            const script = require("./scripts/pr-comment-test-report.js")
+            await script({
+              github,
+              context,
+              fetch,
+              reports,
+            })

      - name: Store Allure test stat in the DB
-        if: ${{ steps.create-allure-report.outputs.report-url }}
+        if: >
+          !cancelled() && (
+            steps.create-allure-report-debug.outputs.report-url ||
+            steps.create-allure-report-release.outputs.report-url
+          )
        env:
-          BUILD_TYPE: ${{ matrix.build_type }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }}
+          REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
+          REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
        run: |
-          curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
          ./scripts/pysync

-          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
+          for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
+            if [ -z "$report_url" ]; then
+              continue
+            fi
+
+            if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
+              BUILD_TYPE=debug
+            else
+              BUILD_TYPE=release
+            fi
+
+            curl --fail --output suites.json "${report_url}"
+            DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
+          done

  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
@@ -898,6 +947,16 @@ jobs:
    needs: [ push-docker-hub, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
      - name: Checkout
        uses: actions/checkout@v3
        with:
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -53,14 +53,14 @@ jobs:
        uses: actions/cache@v3
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v3
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Set extra env for macOS
        run: |
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -841,6 +841,18 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "compute_api"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "chrono",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "workspace_hack",
+]
+
 [[package]]
 name = "compute_tools"
 version = "0.1.0"
@@ -848,6 +860,7 @@ dependencies = [
 "anyhow",
 "chrono",
 "clap 4.1.4",
+ "compute_api",
 "futures",
 "hyper",
 "notify",
@@ -2474,6 +2487,7 @@ dependencies = [
 "strum",
 "strum_macros",
 "svg_fmt",
+ "sync_wrapper",
 "tempfile",
 "tenant_size_model",
 "thiserror",
@@ -4556,6 +4570,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "rand",
+ "regex",
 "routerify",
 "sentry",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -132,6 +132,7 @@ tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df6
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

 ## Local libraries
+compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
--- a/README.md
+++ b/README.md
@@ -40,6 +40,8 @@ pacman -S base-devel readline zlib libseccomp openssl clang \
 postgresql-libs cmake postgresql protobuf
 ```

+Building Neon requires `protoc` (protobuf-compiler) version 3.15 or newer. If your distribution provides an older version, you can install a newer one from [here](https://github.com/protocolbuffers/protobuf/releases).
+
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```
 # recommended approach from https://www.rust-lang.org/tools/install
@@ -145,15 +147,15 @@ Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

 # start postgres compute node
-> ./target/debug/neon_local pg start main
-Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+> ./target/debug/neon_local endpoint start main
+Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
-Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
+Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
-> ./target/debug/neon_local pg list
- NODE  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
- main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
+> ./target/debug/neon_local endpoint list
+ ENDPOINT  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
+ main      127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```

 2. Now, it is possible to connect to postgres and run some queries:
@@ -182,14 +184,14 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/neon_local pg start migration_check --branch-name migration_check
-Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
+> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check
+Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
-Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
+Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'

 # check the new list of running postgres instances
-> ./target/debug/neon_local pg list
- NODE             ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
+> ./target/debug/neon_local endpoint list
+ ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
 main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
 migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,4 +27,5 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 url.workspace = true

+compute_api.workspace = true
 workspace_hack.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -34,22 +34,24 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{Arc, RwLock};
+use std::sync::{Arc, Condvar, Mutex};
 use std::{thread, time::Duration};

-use anyhow::{Context, Result};
+use anyhow::{anyhow, Context, Result};
 use chrono::Utc;
 use clap::Arg;
 use tracing::{error, info};
+use url::Url;

-use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
+use compute_api::models::{ComputeMetrics, ComputeState, ComputeStatus};
+
+use compute_tools::compute::{ComputeNode, ComputeNodeInner, ParsedSpec};
+use compute_tools::configurator::launch_configurator;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
-use compute_tools::pg_helpers::*;
-use compute_tools::spec::*;
-use url::Url;
+use compute_tools::spec::get_spec_from_control_plane;

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -62,7 +64,7 @@ fn main() -> Result<()> {
    let connstr = matches
        .get_one::<String>("connstr")
        .expect("Postgres connection string is required");
-    let spec = matches.get_one::<String>("spec");
+    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

    let compute_id = matches.get_one::<String>("compute-id");
@@ -71,40 +73,97 @@ fn main() -> Result<()> {
    // Try to use just 'postgres' if no path is provided
    let pgbin = matches.get_one::<String>("pgbin").unwrap();

-    let spec: ComputeSpec = match spec {
+    let mut spec = None;
+    let mut live_config_allowed = false;
+    match spec_json {
        // First, try to get cluster spec from the cli argument
-        Some(json) => serde_json::from_str(json)?,
+        Some(json) => {
+            spec = Some(serde_json::from_str(json)?);
+        }
        None => {
            // Second, try to read it from the file if path is provided
            if let Some(sp) = spec_path {
                let path = Path::new(sp);
                let file = File::open(path)?;
-                serde_json::from_reader(file)?
+                spec = Some(serde_json::from_reader(file)?);
            } else if let Some(id) = compute_id {
                if let Some(cp_base) = control_plane_uri {
-                    let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
-                    let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
-                        Ok(v) => v,
-                        Err(_) => "".to_string(),
-                    };
-
-                    reqwest::blocking::Client::new()
-                        .get(cp_uri)
-                        .header("Authorization", jwt)
-                        .send()?
-                        .json()?
+                    live_config_allowed = true;
+                    if let Ok(s) = get_spec_from_control_plane(cp_base, id) {
+                        spec = Some(s);
+                    }
                } else {
-                    panic!(
-                        "must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
-                        control_plane_uri, compute_id
-                    );
+                    panic!("must specify both --control-plane-uri and --compute-id or none");
                }
            } else {
-                panic!("compute spec should be provided via --spec or --spec-path argument");
+                panic!(
+                    "compute spec should be provided by one of the following ways: \
+                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
+                );
            }
        }
    };

+    // Volatile compute state under mutex and condition variable to notify everyone
+    // who is interested in the state changes.
+    let compute_node = ComputeNode {
+        start_time: Utc::now(),
+        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
+        pgdata: pgdata.to_string(),
+        pgbin: pgbin.to_string(),
+        live_config_allowed,
+        inner: Mutex::new(ComputeNodeInner {
+            state: ComputeState {
+                status: ComputeStatus::Empty,
+                last_active: Utc::now(),
+                error: None,
+            },
+            spec: None,
+            metrics: ComputeMetrics::default(),
+        }),
+        state_changed: Condvar::new()
+    };
+
+    // If we have a spec already, go immediately into Init state.
+    let spec_set = spec.is_some();
+    if let Some(spec) = spec {
+        let mut inner = compute_node.inner.lock().unwrap();
+
+        let parsed_spec = ParsedSpec::try_from(spec)
+            .map_err(|msg| anyhow!("error parsing compute spec: {msg}"))?;
+        inner.spec = Some(parsed_spec);
+        inner.state.status = ComputeStatus::Init;
+    }
+
+    let compute = Arc::new(compute_node);
+
+    // Launch the http service first, so that we are able to serve control-plane
+    // requests while configuration is still in progress.
+    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+
+    if !spec_set {
+        // No spec was provided earlier, hang waiting for it.
+        info!("no compute spec provided, waiting");
+
+        let mut inner = compute.inner.lock().unwrap();
+        // Re-check the predicate around every wait: the HTTP handler may have
+        // already delivered the spec (and set ConfigurationPending) before we
+        // took the lock, and condition variable wakeups can be spurious.
+        while inner.state.status != ComputeStatus::ConfigurationPending {
+            inner = compute.state_changed.wait(inner).unwrap();
+        }
+
+        info!("got spec, continue configuration");
+        // Spec is already set by the http server handler.
+        inner.state.status = ComputeStatus::Init;
+    };
+
+    // We got the spec. Start up
+    let startup_tracing_context = {
+        let inner = compute.inner.lock().unwrap();
+        inner.spec.as_ref().unwrap().spec.startup_tracing_context.clone()
+    };
+
    // Extract OpenTelemetry context for the startup actions from the spec, and
    // attach it to the current tracing context.
    //
@@ -120,7 +179,7 @@ fn main() -> Result<()> {
    // postgres is configured and up-and-running, we exit this span. Any other
    // actions that are performed on incoming HTTP requests, for example, are
    // performed in separate spans.
-    let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context {
+    let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
        use opentelemetry::propagation::TextMapPropagator;
        use opentelemetry::sdk::propagation::TraceContextPropagator;
        Some(TraceContextPropagator::new().extract(carrier).attach())
@@ -128,42 +187,10 @@ fn main() -> Result<()> {
        None
    };

-    let pageserver_connstr = spec
-        .cluster
-        .settings
-        .find("neon.pageserver_connstring")
-        .expect("pageserver connstr should be provided");
-    let storage_auth_token = spec.storage_auth_token.clone();
-    let tenant = spec
-        .cluster
-        .settings
-        .find("neon.tenant_id")
-        .expect("tenant id should be provided");
-    let timeline = spec
-        .cluster
-        .settings
-        .find("neon.timeline_id")
-        .expect("tenant id should be provided");
-
-    let compute_state = ComputeNode {
-        start_time: Utc::now(),
-        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
-        pgdata: pgdata.to_string(),
-        pgbin: pgbin.to_string(),
-        spec,
-        tenant,
-        timeline,
-        pageserver_connstr,
-        storage_auth_token,
-        metrics: ComputeMetrics::default(),
-        state: RwLock::new(ComputeState::new()),
-    };
-    let compute = Arc::new(compute_state);
-
-    // Launch service threads first, so we were able to serve availability
-    // requests, while configuration is still in progress.
-    let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
+    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
+    let _configurator_handle =
+        launch_configurator(&compute).expect("cannot launch configurator thread");

    // Start Postgres
    let mut delay_exit = false;
@@ -172,10 +199,10 @@ fn main() -> Result<()> {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
-            let mut state = compute.state.write().unwrap();
-            state.error = Some(format!("{:?}", err));
-            state.status = ComputeStatus::Failed;
-            drop(state);
+            let mut inner = compute.inner.lock().unwrap();
+            inner.state.error = Some(format!("{:?}", err));
+            inner.state.status = ComputeStatus::Failed;
+            drop(inner);
            delay_exit = true;
            None
        }
@@ -262,7 +289,7 @@ fn cli() -> clap::Command {
            Arg::new("control-plane-uri")
                .short('p')
                .long("control-plane-uri")
-                .value_name("CONTROL_PLANE"),
+                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -19,16 +19,17 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::atomic::{AtomicU64, Ordering};
-use std::sync::RwLock;
+use std::sync::{Condvar, Mutex};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use serde::{Serialize, Serializer};
 use tokio_postgres;
 use tracing::{info, instrument, warn};

+use compute_api::models::{ComputeMetrics, ComputeState, ComputeStatus};
+use compute_api::spec::ComputeSpec;
+
 use crate::checker::create_writability_check_data;
 use crate::config;
 use crate::pg_helpers::*;
@@ -41,74 +42,92 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    // We only allow live re- / configuration of the compute node if
+    // it uses 'pull model', i.e. it can go to control-plane and fetch
+    // the latest configuration. Otherwise, there could be a case:
+    // - we start compute with some spec provided as argument
+    // - we push new spec and it does reconfiguration
+    // - but then something happens and compute pod / VM is destroyed,
+    //   so k8s controller starts it again with the **old** spec
+    pub live_config_allowed: bool,
+
+    /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
+    /// Coupled with `Condvar` to allow notifying HTTP API and configurator
+    /// thread about state changes. To allow HTTP API server to serve status
+    /// requests while configuration is in progress, the lock should be held
+    /// only for short periods of time to do read/write, not for the whole
+    /// configuration process.
+    pub inner: Mutex<ComputeNodeInner>,
+    pub state_changed: Condvar,
+}
+
+pub struct ComputeNodeInner {
+    pub state: ComputeState,
+
+    pub spec: Option<ParsedSpec>,
+
+    pub metrics: ComputeMetrics,
+}
+
+#[derive(Clone)]
+pub struct ParsedSpec {
    pub spec: ComputeSpec,
+
+    // extra fields extracted from 'spec'.
    pub tenant: String,
    pub timeline: String,
    pub pageserver_connstr: String,
    pub storage_auth_token: Option<String>,
-    pub metrics: ComputeMetrics,
-    /// Volatile part of the `ComputeNode` so should be used under `RwLock`
-    /// to allow HTTP API server to serve status requests, while configuration
-    /// is in progress.
-    pub state: RwLock<ComputeState>,
 }

-fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
-where
-    S: Serializer,
-{
-    x.to_rfc3339().serialize(s)
-}
+impl TryFrom<ComputeSpec> for ParsedSpec {
+    type Error = String; // human-readable name of the missing mandatory setting

-#[derive(Serialize)]
-#[serde(rename_all = "snake_case")]
-pub struct ComputeState {
-    pub status: ComputeStatus,
-    /// Timestamp of the last Postgres activity
-    #[serde(serialize_with = "rfc3339_serialize")]
-    pub last_active: DateTime<Utc>,
-    pub error: Option<String>,
-}
+    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
+        let pageserver_connstr = spec
+            .cluster
+            .settings
+            .find("neon.pageserver_connstring")
+            .ok_or("pageserver connstr should be provided")?;
+        let storage_auth_token = spec.storage_auth_token.clone();
+        let tenant = spec
+            .cluster
+            .settings
+            .find("neon.tenant_id")
+            .ok_or("tenant id should be provided")?;
+        let timeline = spec
+            .cluster
+            .settings
+            .find("neon.timeline_id")
+            .ok_or("timeline id should be provided")?;

-impl ComputeState {
-    pub fn new() -> Self {
-        Self {
-            status: ComputeStatus::Init,
-            last_active: Utc::now(),
-            error: None,
-        }
+        Ok(ParsedSpec {
+            spec,
+            pageserver_connstr,
+            storage_auth_token,
+            tenant,
+            timeline,
+        })
    }
 }

-impl Default for ComputeState {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ComputeStatus {
-    Init,
-    Running,
-    Failed,
-}
-
-#[derive(Default, Serialize)]
-pub struct ComputeMetrics {
-    pub sync_safekeepers_ms: AtomicU64,
-    pub basebackup_ms: AtomicU64,
-    pub config_ms: AtomicU64,
-    pub total_startup_ms: AtomicU64,
-}
-
 impl ComputeNode {
    pub fn set_status(&self, status: ComputeStatus) {
-        self.state.write().unwrap().status = status;
+        let mut inner = self.inner.lock().unwrap();
+        inner.state.status = status;
+        self.state_changed.notify_all();
    }

    pub fn get_status(&self) -> ComputeStatus {
-        self.state.read().unwrap().status
+        self.inner.lock().unwrap().state.status
+    }
+
+    pub fn get_state(&self) -> ComputeState {
+        self.inner.lock().unwrap().state.clone()
+    }
+
+    pub fn get_metrics(&self) -> ComputeMetrics {
+        self.inner.lock().unwrap().metrics.clone()
    }

    // Remove `pgdata` directory and create it again with right permissions.
@@ -124,15 +143,15 @@ impl ComputeNode {

    // Get basebackup from the libpq connection to pageserver using `connstr` and
    // unarchive it to `pgdata` directory overriding all its previous content.
-    #[instrument(skip(self))]
-    fn get_basebackup(&self, lsn: &str) -> Result<()> {
+    #[instrument(skip(self, spec))]
+    fn get_basebackup(&self, spec: &ParsedSpec, lsn: &str) -> Result<()> {
        let start_time = Utc::now();

-        let mut config = postgres::Config::from_str(&self.pageserver_connstr)?;
+        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
-        if let Some(storage_auth_token) = &self.storage_auth_token {
+        if let Some(storage_auth_token) = &spec.storage_auth_token {
            info!("Got storage auth token from spec file");
            config.password(storage_auth_token);
        } else {
@@ -141,8 +160,8 @@ impl ComputeNode {

        let mut client = config.connect(NoTls)?;
        let basebackup_cmd = match lsn {
-            "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
-            _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
+            "0/0" => format!("basebackup {} {}", &spec.tenant, &spec.timeline), // First start of the compute
+            _ => format!("basebackup {} {} {}", &spec.tenant, &spec.timeline, lsn),
        };
        let copyreader = client.copy_out(basebackup_cmd.as_str())?;

@@ -155,28 +174,24 @@ impl ComputeNode {
        ar.set_ignore_zeros(true);
        ar.unpack(&self.pgdata)?;

-        self.metrics.basebackup_ms.store(
-            Utc::now()
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-
+        self.inner.lock().unwrap().metrics.basebackup_ms = Utc::now()
+            .signed_duration_since(start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
        Ok(())
    }

    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
-    #[instrument(skip(self))]
-    fn sync_safekeepers(&self) -> Result<String> {
+    #[instrument(skip(self, storage_auth_token))]
+    fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<String> {
        let start_time = Utc::now();

        let sync_handle = Command::new(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
-            .envs(if let Some(storage_auth_token) = &self.storage_auth_token {
+            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
            } else {
                vec![]
@@ -201,14 +216,11 @@ impl ComputeNode {
            );
        }

-        self.metrics.sync_safekeepers_ms.store(
-            Utc::now()
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
+        self.inner.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now()
+            .signed_duration_since(start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;

        let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());

@@ -217,29 +229,28 @@ impl ComputeNode {

    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
-    #[instrument(skip(self))]
-    pub fn prepare_pgdata(&self) -> Result<()> {
-        let spec = &self.spec;
+    #[instrument(skip(self, spec))]
+    fn prepare_pgdata(&self, spec: &ParsedSpec) -> Result<()> {
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec.spec)?;

        info!("starting safekeepers syncing");
        let lsn = self
-            .sync_safekeepers()
+            .sync_safekeepers(spec.storage_auth_token.clone())
            .with_context(|| "failed to sync safekeepers")?;
        info!("safekeepers synced at LSN {}", lsn);

        info!(
            "getting basebackup@{} from pageserver {}",
-            lsn, &self.pageserver_connstr
+            lsn, &spec.pageserver_connstr
        );
-        self.get_basebackup(&lsn).with_context(|| {
+        self.get_basebackup(spec, &lsn).with_context(|| {
            format!(
                "failed to get basebackup@{} from pageserver {}",
-                lsn, &self.pageserver_connstr
+                lsn, &spec.pageserver_connstr
            )
        })?;

@@ -252,13 +263,16 @@ impl ComputeNode {
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip(self))]
-    pub fn start_postgres(&self) -> Result<std::process::Child> {
+    pub fn start_postgres(
+        &self,
+        storage_auth_token: Option<String>,
+    ) -> Result<std::process::Child> {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
        let mut pg = Command::new(&self.pgbin)
            .args(["-D", &self.pgdata])
-            .envs(if let Some(storage_auth_token) = &self.storage_auth_token {
+            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
            } else {
                vec![]
@@ -271,8 +285,9 @@ impl ComputeNode {
        Ok(pg)
    }

-    #[instrument(skip(self))]
-    pub fn apply_config(&self) -> Result<()> {
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip(self, spec))]
+    fn apply_config(&self, spec: &ParsedSpec) -> Result<()> {
        // If connection fails,
        // it may be the old node with `zenith_admin` superuser.
        //
@@ -303,19 +318,64 @@ impl ComputeNode {
        };

        // Proceed with post-startup configuration. Note, that order of operations is important.
-        handle_roles(&self.spec, &mut client)?;
-        handle_databases(&self.spec, &mut client)?;
-        handle_role_deletions(self, &mut client)?;
-        handle_grants(self, &mut client)?;
+        handle_roles(&spec.spec, &mut client)?;
+        handle_databases(&spec.spec, &mut client)?;
+        handle_role_deletions(&spec.spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(&spec.spec, self.connstr.as_str(), &mut client)?;
        create_writability_check_data(&mut client)?;
-        handle_extensions(&self.spec, &mut client)?;
+        handle_extensions(&spec.spec, &mut client)?;

        // 'Close' connection
        drop(client);

        info!(
            "finished configuration of compute for project {}",
-            self.spec.cluster.cluster_id
+            spec.spec.cluster.cluster_id
+        );
+
+        Ok(())
+    }
+
+    // Reload the Postgres configuration through the already opened superuser
+    // connection. We could've wrapped this around `pg_ctl reload`, but we don't
+    // use `pg_ctl` for start / stop right now, so a plain SQL call is simpler.
+    #[instrument(skip(self, client))]
+    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
+        client.simple_query("SELECT pg_reload_conf()")?;
+        Ok(())
+    }
+
+    /// Similar to `apply_config()`, but performs a slightly different sequence of
+    /// operations, as it reconfigures a previously started and configured Postgres node.
+    #[instrument(skip(self))]
+    pub fn reconfigure(&self) -> Result<()> {
+        let spec = {
+            let inner = self.inner.lock().unwrap();
+            inner.spec.as_ref().expect("cannot reconfigure without spec").spec.clone()
+        };
+
+        // Write new config
+        let pgdata_path = Path::new(&self.pgdata);
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+
+        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        self.pg_reload_conf(&mut client)?;
+
+        // Proceed with post-startup configuration. Note, that order of operations is important.
+        handle_roles(&spec, &mut client)?;
+        handle_databases(&spec, &mut client)?;
+        handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
+        handle_grants(&spec, self.connstr.as_str(), &mut client)?;
+        handle_extensions(&spec, &mut client)?;
+
+        // 'Close' connection
+        drop(client);
+
+        let unknown_op = "unknown".to_string();
+        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
+        info!(
+            "finished reconfiguration of compute node for operation {}",
+            op_id
        );

        Ok(())
@@ -323,40 +383,44 @@ impl ComputeNode {

    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
+        let spec = self
+            .inner
+            .lock()
+            .unwrap()
+            .spec
+            .as_ref()
+            .expect("cannot start_compute without spec")
+            .clone();
        info!(
            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            self.spec.cluster.cluster_id,
-            self.spec.operation_uuid.as_ref().unwrap(),
-            self.tenant,
-            self.timeline,
+            spec.spec.cluster.cluster_id,
+            spec.spec.operation_uuid.as_ref().unwrap(),
+            spec.tenant,
+            spec.timeline,
        );

-        self.prepare_pgdata()?;
+        self.prepare_pgdata(&spec)?;

        let start_time = Utc::now();

-        let pg = self.start_postgres()?;
+        let pg = self.start_postgres(spec.storage_auth_token.clone())?;

-        self.apply_config()?;
+        self.apply_config(&spec)?;

        let startup_end_time = Utc::now();
-        self.metrics.config_ms.store(
-            startup_end_time
+        {
+            let mut inner = self.inner.lock().unwrap();
+            inner.metrics.config_ms = startup_end_time
                .signed_duration_since(start_time)
                .to_std()
                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-        self.metrics.total_startup_ms.store(
-            startup_end_time
+                .as_millis() as u64;
+            inner.metrics.total_startup_ms = startup_end_time
                .signed_duration_since(self.start_time)
                .to_std()
                .unwrap()
-                .as_millis() as u64,
-            Ordering::Relaxed,
-        );
-
+                .as_millis() as u64;
+        }
        self.set_status(ComputeStatus::Running);

        Ok(pg)
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,7 +6,7 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::PgOptionsSerialize;
-use crate::spec::ComputeSpec;
+use compute_api::spec::ComputeSpec;

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -0,0 +1,53 @@
+use std::sync::Arc;
+use std::thread;
+
+use anyhow::Result;
+use tracing::{error, info, instrument};
+
+use crate::compute::ComputeNode;
+use compute_api::models::ComputeStatus;
+
+#[instrument(skip(compute))]
+fn configurator_main_loop(compute: &Arc<ComputeNode>) {
+    info!("waiting for reconfiguration requests");
+    // Check the status under the lock *before* waiting, so a request published
+    // while this thread was not yet waiting is not lost. NOTE(review): assumes
+    // main() has already consumed the initial ConfigurationPending -- confirm.
+    let mut inner = compute.inner.lock().unwrap();
+    loop {
+        if inner.state.status == ComputeStatus::ConfigurationPending {
+            info!("got configuration request");
+            inner.state.status = ComputeStatus::Configuration;
+            compute.state_changed.notify_all();
+            drop(inner);
+
+            let mut new_status = ComputeStatus::Failed;
+            if let Err(e) = compute.reconfigure() {
+                error!("could not configure compute node: {}", e);
+            } else {
+                new_status = ComputeStatus::Running;
+                info!("compute node configured");
+            }
+            compute.set_status(new_status);
+            // Re-check the fresh status right away instead of sleeping first.
+            inner = compute.inner.lock().unwrap();
+            continue;
+        } else if inner.state.status == ComputeStatus::Failed {
+            info!("compute node is now in Failed state, exiting");
+            break;
+        }
+        info!("waiting for compute status change, current: {:?}", inner.state.status);
+        inner = compute.state_changed.wait(inner).unwrap();
+    }
+}
+
+/// Spawn the `compute-configurator` thread running `configurator_main_loop`.
+pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
+    let node = Arc::clone(compute);
+    let builder = thread::Builder::new().name("compute-configurator".into());
+    let handle = builder.spawn(move || {
+        configurator_main_loop(&node);
+        info!("configurator thread is exited");
+    })?;
+    Ok(handle)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -3,7 +3,10 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;

-use crate::compute::ComputeNode;
+use crate::compute::{ComputeNode, ParsedSpec};
+use crate::http::models::{ConfigurationRequest, GenericAPIError};
+use compute_api::models::ComputeStatus;
+
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
@@ -12,6 +15,44 @@ use serde_json;
 use tracing::{error, info};
 use tracing_utils::http::OtelName;

+/// Validate and store the spec from a `/configure` request, notify the configurator thread
+/// and block until compute is `Running` (Ok) or `Failed` (Err with message and HTTP status).
+async fn handle_spec_request(req: Request<Body>, compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    if !compute.live_config_allowed {
+        return Err(("live reconfiguration is not allowed for this compute node".to_string(), StatusCode::PRECONDITION_FAILED));
+    }
+
+    // The body comes from the network: report unreadable input as 400 instead of panicking.
+    let body_bytes = hyper::body::to_bytes(req.into_body()).await
+        .map_err(|err| (format!("could not read request body: {err}"), StatusCode::BAD_REQUEST))?;
+    // `from_slice` rejects non-UTF-8 input with a regular parse error.
+    let request = serde_json::from_slice::<ConfigurationRequest>(&body_bytes)
+        .map_err(|err| (format!("could not parse request json: {err}"), StatusCode::BAD_REQUEST))?;
+    let spec = ParsedSpec::try_from(request.spec)
+        .map_err(|err| (format!("could not parse spec: {err}"), StatusCode::BAD_REQUEST))?;
+
+    let mut inner = compute.inner.lock().unwrap();
+    if !(inner.state.status == ComputeStatus::Empty || inner.state.status == ComputeStatus::Running) {
+        return Err((format!(
+            "invalid compute status for reconfiguration request: {}",
+            serde_json::to_string(&inner.state).unwrap()
+        ), StatusCode::PRECONDITION_FAILED));
+    }
+    inner.spec = Some(spec);
+    inner.state.status = ComputeStatus::ConfigurationPending;
+    compute.state_changed.notify_all();
+    info!("set new spec and notified configurator");
+
+    while inner.state.status != ComputeStatus::Running {
+        inner = compute.state_changed.wait(inner).unwrap();
+        info!("waiting for compute to become Running, current status: {:?}", inner.state.status);
+        if inner.state.status == ComputeStatus::Failed {
+            // Configuration ended in a terminal error state: stop waiting and report it.
+            return Err((format!("compute configuration failed: {}", inner.state.error.as_deref().unwrap_or("see compute logs")), StatusCode::INTERNAL_SERVER_ERROR));
+        }
+    }
+    Ok(())
+}
+
 // Service function to handle all available routes.
 async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body> {
    //
@@ -23,26 +64,44 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
        // Serialized compute state.
        (&Method::GET, "/status") => {
            info!("serving /status GET request");
-            let state = compute.state.read().unwrap();
-            Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
+            let state = compute.get_state();
+            Response::new(Body::from(serde_json::to_string(&state).unwrap()))
        }

        // Startup metrics in JSON format. Keep /metrics reserved for a possible
        // future use for Prometheus metrics format.
        (&Method::GET, "/metrics.json") => {
            info!("serving /metrics.json GET request");
-            Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
+            let metrics = compute.get_metrics();
+            Response::new(Body::from(serde_json::to_string(&metrics).unwrap()))
        }

        // Collect Postgres current usage insights
        (&Method::GET, "/insights") => {
            info!("serving /insights GET request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!("compute is not running, current status: {:?}", status);
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
            let insights = compute.collect_insights().await;
            Response::new(Body::from(insights))
        }

        (&Method::POST, "/check_writability") => {
            info!("serving /check_writability POST request");
+            let status = compute.get_status();
+            if status != ComputeStatus::Running {
+                let msg = format!(
+                    "invalid compute status for check_writability request: {:?}",
+                    status
+                );
+                error!(msg);
+                return Response::new(Body::from(msg));
+            }
+
            let res = crate::checker::check_writability(compute).await;
            match res {
                Ok(_) => Response::new(Body::from("true")),
@@ -61,6 +120,24 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            ))
        }

+        // Accept spec in JSON format and request compute configuration from
+        // the configurator thread. If anything goes wrong after we set the
+        // compute state to `ConfigurationPending` and / or sent spec to the
+        // configurator thread, we basically leave compute in the potentially
+        // wrong state. That said, it's control-plane's responsibility to
+        // watch compute state after reconfiguration request and to clean
+        // restart in case of errors.
+        (&Method::POST, "/configure") => {
+            info!("serving /configure POST request");
+            match handle_spec_request(req, compute).await {
+                Ok(()) => Response::new(Body::from("ok")),
+                Err((msg, code)) => {
+                    error!("error handling /configure request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
@@ -70,6 +147,16 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
    }
 }

+/// Build a JSON error response (`{"error": "..."}`) with the given HTTP status.
+fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
+    let error = GenericAPIError { error: e.to_string() };
+    Response::builder()
+        .status(status)
+        .header("Content-Type", "application/json")
+        .body(Body::from(serde_json::to_string(&error).unwrap()))
+        .unwrap()
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(state: Arc<ComputeNode>) {
@@ -110,7 +197,6 @@ async fn serve(state: Arc<ComputeNode>) {
 /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
 pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
    let state = Arc::clone(state);
-
    Ok(thread::Builder::new()
        .name("http-endpoint".into())
        .spawn(move || serve(state))?)
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -1 +1,2 @@
 pub mod api;
+pub mod models;
--- a/compute_tools/src/http/models.rs
+++ b/compute_tools/src/http/models.rs
@@ -0,0 +1,16 @@
+use serde::{Deserialize, Serialize};
+
+use compute_api::spec::ComputeSpec;
+
+/// We now pass only `spec` in the configuration request, but later we can
+/// extend it and add something like `restart: bool` or something else. So put
+/// `spec` into a struct initially to be more flexible in the future.
+#[derive(Deserialize, Debug)]
+pub struct ConfigurationRequest {
+    pub spec: ComputeSpec,
+}
+
+#[derive(Serialize, Debug)]
+pub struct GenericAPIError {
+    pub error: String,
+}
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -11,7 +11,7 @@ paths:
    get:
      tags:
      - Info
-      summary: Get compute node internal status
+      summary: Get compute node internal status.
      description: ""
      operationId: getComputeStatus
      responses:
@@ -26,7 +26,7 @@ paths:
    get:
      tags:
      - Info
-      summary: Get compute node startup metrics in JSON format
+      summary: Get compute node startup metrics in JSON format.
      description: ""
      operationId: getComputeMetricsJSON
      responses:
@@ -41,9 +41,9 @@ paths:
    get:
      tags:
      - Info
-      summary: Get current compute insights in JSON format
+      summary: Get current compute insights in JSON format.
      description: |
-        Note, that this doesn't include any historical data
+        Note, that this doesn't include any historical data.
      operationId: getComputeInsights
      responses:
        200:
@@ -56,12 +56,12 @@ paths:
  /info:
    get:
      tags:
-      - "info"
-      summary: Get info about the compute Pod/VM
+      - Info
+      summary: Get info about the compute pod / VM.
      description: ""
      operationId: getInfo
      responses:
-        "200":
+        200:
          description: Info
          content:
            application/json:
@@ -72,7 +72,7 @@ paths:
    post:
      tags:
      - Check
-      summary: Check that we can write new data on this compute
+      summary: Check that we can write new data on this compute.
      description: ""
      operationId: checkComputeWritability
      responses:
@@ -82,9 +82,57 @@ paths:
            text/plain:
              schema:
                type: string
-                description: Error text or 'true' if check passed
+                description: Error text or 'true' if check passed.
                example: "true"

+  /configure:
+    post:
+      tags:
+      - Configure
+      summary: Request compute node configuration.
+      description: |
+        This is a blocking API endpoint, i.e. it blocks waiting until
+        compute has finished configuration and is in `Running` state.
+        Optional non-blocking mode could be added later. Currently,
+        it's also assumed that reconfiguration doesn't require restart.
+      operationId: configureCompute
+      requestBody:
+        description: Configuration request.
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - spec
+              properties:
+                spec:
+                  # XXX: I don't want to explain current spec in the OpenAPI format,
+                  # as it could be changed really soon. Consider doing it later.
+                  type: object
+      responses:
+        200:
+          description: Compute configuration finished.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeState"
+        400:
+          description: Provided spec is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        412:
+          description: |
+            It's not possible to do live-configuration of the compute.
+            It's either in the wrong state, or compute doesn't use pull
+            mode of configuration.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
  securitySchemes:
    JWT:
@@ -95,7 +143,7 @@ components:
  schemas:
    ComputeMetrics:
      type: object
-      description: Compute startup metrics
+      description: Compute startup metrics.
      required:
        - sync_safekeepers_ms
        - basebackup_ms
@@ -113,7 +161,7 @@ components:

    Info:
      type: object
-      description: Information about VM/Pod
+      description: Information about VM/Pod.
      required:
        - num_cpus
      properties:
@@ -130,17 +178,26 @@ components:
          $ref: '#/components/schemas/ComputeStatus'
        last_active:
          type: string
-          description: The last detected compute activity timestamp in UTC and RFC3339 format
+          description: The last detected compute activity timestamp in UTC and RFC3339 format.
          example: "2022-10-12T07:20:50.52Z"
        error:
          type: string
-          description: Text of the error during compute startup, if any
+          description: Text of the error during compute startup, if any.
+          example: ""
+        tenant:
+          type: string
+          description: Identifier of the current tenant served by compute node, if any.
+          example: c9269c359e9a199fad1ea0981246a78f
+        timeline:
+          type: string
+          description: Identifier of the current timeline served by compute node, if any.
+          example: ece7de74d4b8cbe5433a68ce4d1b97b4

    ComputeInsights:
      type: object
      properties:
        pg_stat_statements:
-          description: Contains raw output from pg_stat_statements in JSON format
+          description: Contains raw output from pg_stat_statements in JSON format.
          type: array
          items:
            type: object
@@ -151,6 +208,19 @@ components:
        - init
        - failed
        - running
+      example: running
+
+    #
+    # Errors
+    #
+
+    GenericError:
+      type: object
+      required:
+        - error
+      properties:
+        error:
+          type: string

 security:
  - JWT: []
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -4,6 +4,7 @@
 //!
 pub mod checker;
 pub mod config;
+pub mod configurator;
 pub mod http;
 #[macro_use]
 pub mod logger;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                            AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
                        &[],
                    );
-                let mut last_active = compute.state.read().unwrap().last_active;
+                let mut last_active = compute.inner.lock().unwrap().state.last_active;

                if let Ok(backs) = backends {
                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -87,9 +87,9 @@ fn watch_compute_activity(compute: &ComputeNode) {
                }

                // Update the last activity in the shared state if we got a more recent one.
-                let mut state = compute.state.write().unwrap();
-                if last_active > state.last_active {
-                    state.last_active = last_active;
+                let mut inner = compute.inner.lock().unwrap();
+                if last_active > inner.state.last_active {
+                    inner.state.last_active = last_active;
                    debug!("set the last compute activity time to: {}", last_active);
                }
            }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -10,43 +10,12 @@ use std::time::{Duration, Instant};
 use anyhow::{bail, Result};
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
-use serde::Deserialize;
 use tracing::{debug, instrument};

+use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
+
 const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds

-/// Rust representation of Postgres role info with only those fields
-/// that matter for us.
-#[derive(Clone, Deserialize)]
-pub struct Role {
-    pub name: PgIdent,
-    pub encrypted_password: Option<String>,
-    pub options: GenericOptions,
-}
-
-/// Rust representation of Postgres database info with only those fields
-/// that matter for us.
-#[derive(Clone, Deserialize)]
-pub struct Database {
-    pub name: PgIdent,
-    pub owner: PgIdent,
-    pub options: GenericOptions,
-}
-
-/// Common type representing both SQL statement params with or without value,
-/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
-/// options like `wal_level = logical`.
-#[derive(Clone, Deserialize)]
-pub struct GenericOption {
-    pub name: String,
-    pub value: Option<String>,
-    pub vartype: String,
-}
-
-/// Optional collection of `GenericOption`'s. Type alias allows us to
-/// declare a `trait` on it.
-pub type GenericOptions = Option<Vec<GenericOption>>;
-
 /// Escape a string for including it in a SQL literal
 fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
@@ -58,9 +27,14 @@ fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
 }

-impl GenericOption {
+trait GenericOptionExt {
+    fn to_pg_option(&self) -> String;
+    fn to_pg_setting(&self) -> String;
+}
+
+impl GenericOptionExt for GenericOption {
    /// Represent `GenericOption` as SQL statement parameter.
-    pub fn to_pg_option(&self) -> String {
+    fn to_pg_option(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
                "string" => format!("{} '{}'", self.name, escape_literal(val)),
@@ -72,7 +46,7 @@ impl GenericOption {
    }

    /// Represent `GenericOption` as configuration option.
-    pub fn to_pg_setting(&self) -> String {
+    fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
                "string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
@@ -131,10 +105,14 @@ impl GenericOptionsSearch for GenericOptions {
    }
 }

-impl Role {
+pub trait RoleExt {
+    fn to_pg_options(&self) -> String;
+}
+
+impl RoleExt for Role {
    /// Serialize a list of role parameters into a Postgres-acceptable
    /// string of arguments.
-    pub fn to_pg_options(&self) -> String {
+    fn to_pg_options(&self) -> String {
        // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
        // For now, we do not use generic `options` for roles. Once used, add
        // `self.options.as_pg_options()` somewhere here.
@@ -159,21 +137,17 @@ impl Role {
    }
 }

-impl Database {
-    pub fn new(name: PgIdent, owner: PgIdent) -> Self {
-        Self {
-            name,
-            owner,
-            options: None,
-        }
-    }
+pub trait DatabaseExt {
+    fn to_pg_options(&self) -> String;
+}

+impl DatabaseExt for Database {
    /// Serialize a list of database parameters into a Postgres-acceptable
    /// string of arguments.
    /// NB: `TEMPLATE` is actually also an identifier, but so far we only need
    /// to use `template0` and `template1`, so it is not a problem. Yet in the future
    /// it may require a proper quoting too.
-    pub fn to_pg_options(&self) -> String {
+    fn to_pg_options(&self) -> String {
        let mut params: String = self.options.as_pg_options();
        write!(params, " OWNER {}", &self.owner.pg_quote())
            .expect("String is documented to not to error during write operations");
@@ -182,10 +156,6 @@ impl Database {
    }
 }

-/// String type alias representing Postgres identifier and
-/// intended to be used for DB / role names.
-pub type PgIdent = String;
-
 /// Generic trait used to provide quoting / encoding for strings used in the
 /// Postgres SQL queries and DATABASE_URL.
 pub trait Escaping {
@@ -226,7 +196,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
            &[],
        )?
        .iter()
-        .map(|row| Database::new(row.get("datname"), row.get("owner")))
+        .map(|row| Database {
+            name: row.get("datname"),
+            owner: row.get("owner"),
+            options: None,
+        })
        .collect();

    Ok(postgres_dbs)
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,57 +1,38 @@
-use std::collections::HashMap;
 use std::path::Path;
 use std::str::FromStr;

 use anyhow::Result;
 use postgres::config::Config;
 use postgres::{Client, NoTls};
-use serde::Deserialize;
 use tracing::{info, info_span, instrument, span_enabled, warn, Level};

-use crate::compute::ComputeNode;
 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

-/// Cluster spec or configuration represented as an optional number of
-/// delta operations + final cluster state description.
-#[derive(Clone, Deserialize)]
-pub struct ComputeSpec {
-    pub format_version: f32,
-    pub timestamp: String,
-    pub operation_uuid: Option<String>,
-    /// Expected cluster state at the end of transition process.
-    pub cluster: Cluster,
-    pub delta_operations: Option<Vec<DeltaOp>>,
+use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};

-    pub storage_auth_token: Option<String>,
+/// Request spec from the control-plane by compute_id. The value of the `NEON_CONSOLE_JWT`
+/// env variable (or an empty string if it is unset) is sent as the `Authorization` header.
+pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeSpec> {
+    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
+    let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
+        Ok(v) => v,
+        Err(_) => "".to_string(),
+    };
+    info!("getting spec from control plane: {}", cp_uri);

-    pub startup_tracing_context: Option<HashMap<String, String>>,
-}
+    // TODO: check the response. We should distinguish cases when it's
+    // - network error, then retry
+    // - no spec for compute yet, then wait
+    // - compute id is unknown or any other error, then bail out
+    let spec = reqwest::blocking::Client::new()
+        .get(cp_uri)
+        .header("Authorization", jwt)
+        .send()?
+        .json()?;

-/// Cluster state seen from the perspective of the external tools
-/// like Rails web console.
-#[derive(Clone, Deserialize)]
-pub struct Cluster {
-    pub cluster_id: String,
-    pub name: String,
-    pub state: Option<String>,
-    pub roles: Vec<Role>,
-    pub databases: Vec<Database>,
-    pub settings: GenericOptions,
-}
-
-/// Single cluster state changing operation that could not be represented as
-/// a static `Cluster` structure. For example:
-/// - DROP DATABASE
-/// - DROP ROLE
-/// - ALTER ROLE name RENAME TO new_name
-/// - ALTER DATABASE name RENAME TO new_name
-#[derive(Clone, Deserialize)]
-pub struct DeltaOp {
-    pub action: String,
-    pub name: PgIdent,
-    pub new_name: Option<PgIdent>,
+    Ok(spec)
 }

 /// It takes cluster specification and does the following:
@@ -226,8 +207,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

 /// Reassign all dependent objects and delete requested roles.
 #[instrument(skip_all)]
-pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    if let Some(ops) = &node.spec.delta_operations {
+pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
+    if let Some(ops) = &spec.delta_operations {
        // First, reassign all dependent objects to db owners.
        info!("reassigning dependent objects of to-be-deleted roles");

@@ -244,7 +225,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
            // Check that role is still present in Postgres, as this could be a
            // restart with the same spec after role deletion.
            if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
-                reassign_owned_objects(node, &op.name)?;
+                reassign_owned_objects(spec, connstr, &op.name)?;
            }
        }

@@ -268,10 +249,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
 }

 // Reassign all owned objects in all databases to the owner of the database.
-fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
-    for db in &node.spec.cluster.databases {
+fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> {
+    for db in &spec.cluster.databases {
        if db.owner != *role_name {
-            let mut conf = Config::from_str(node.connstr.as_str())?;
+            let mut conf = Config::from_str(connstr)?;
            conf.dbname(&db.name);

            let mut client = conf.connect(NoTls)?;
@@ -416,9 +397,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
 /// to allow users creating trusted extensions and re-creating `public` schema, for example.
 #[instrument(skip_all)]
-pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
-    let spec = &node.spec;
-
+pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
    info!("cluster spec grants:");

    // We now have a separate `web_access` role to connect to the database
@@ -450,8 +429,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
    // Do some per-database access adjustments. We'd better do this at db creation time,
    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
    // atomically.
-    for db in &node.spec.cluster.databases {
-        let mut conf = Config::from_str(node.connstr.as_str())?;
+    for db in &spec.cluster.databases {
+        let mut conf = Config::from_str(connstr)?;
        conf.dbname(&db.name);

        let mut db_client = conf.connect(NoTls)?;
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -1,14 +1,13 @@
 #[cfg(test)]
 mod pg_helpers_tests {
-
    use std::fs::File;

+    use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent};
    use compute_tools::pg_helpers::*;
-    use compute_tools::spec::ComputeSpec;

    #[test]
    fn params_serialize() {
-        let file = File::open("tests/cluster_spec.json").unwrap();
+        let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();

        assert_eq!(
@@ -23,7 +22,7 @@ mod pg_helpers_tests {

    #[test]
    fn settings_serialize() {
-        let file = File::open("tests/cluster_spec.json").unwrap();
+        let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap();
        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();

        assert_eq!(
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -7,7 +7,7 @@
 //!
 use anyhow::{anyhow, bail, Context, Result};
 use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
-use control_plane::compute::ComputeControlPlane;
+use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::LocalEnv;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
@@ -106,8 +106,8 @@ fn main() -> Result<()> {
            "start" => handle_start_all(sub_args, &env),
            "stop" => handle_stop_all(sub_args, &env),
            "pageserver" => handle_pageserver(sub_args, &env),
-            "pg" => handle_pg(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
+            "endpoint" => handle_endpoint(sub_args, &env),
            _ => bail!("unexpected subcommand {sub_name}"),
        };

@@ -470,10 +470,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            let mut cplane = ComputeControlPlane::load(env.clone())?;
            println!("Importing timeline into pageserver ...");
            pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
-            println!("Creating node for imported timeline ...");
            env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;

-            cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?;
+            println!("Creating endpoint for imported timeline ...");
+            cplane.new_endpoint(tenant_id, name, timeline_id, None, None, pg_version)?;
            println!("Done");
        }
        Some(("branch", branch_match)) => {
@@ -521,10 +521,10 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
    Ok(())
 }

-fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
-    let (sub_name, sub_args) = match pg_match.subcommand() {
-        Some(pg_subcommand_data) => pg_subcommand_data,
-        None => bail!("no pg subcommand provided"),
+fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match ep_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no endpoint subcommand provided"),
    };

    let mut cplane = ComputeControlPlane::load(env.clone())?;
@@ -546,7 +546,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
            table.load_preset(comfy_table::presets::NOTHING);

            table.set_header([
-                "NODE",
+                "ENDPOINT",
                "ADDRESS",
                "TIMELINE",
                "BRANCH NAME",
@@ -554,39 +554,39 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                "STATUS",
            ]);

-            for ((_, node_name), node) in cplane
-                .nodes
+            for (endpoint_id, endpoint) in cplane
+                .endpoints
                .iter()
-                .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
+                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
            {
-                let lsn_str = match node.lsn {
+                let lsn_str = match endpoint.lsn {
                    None => {
-                        // -> primary node
+                        // -> primary endpoint
                        // Use the LSN at the end of the timeline.
                        timeline_infos
-                            .get(&node.timeline_id)
+                            .get(&endpoint.timeline_id)
                            .map(|bi| bi.last_record_lsn.to_string())
                            .unwrap_or_else(|| "?".to_string())
                    }
                    Some(lsn) => {
-                        // -> read-only node
-                        // Use the node's LSN.
+                        // -> read-only endpoint
+                        // Use the endpoint's LSN.
                        lsn.to_string()
                    }
                };

                let branch_name = timeline_name_mappings
-                    .get(&TenantTimelineId::new(tenant_id, node.timeline_id))
+                    .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
                    .map(|name| name.as_str())
                    .unwrap_or("?");

                table.add_row([
-                    node_name.as_str(),
-                    &node.address.to_string(),
-                    &node.timeline_id.to_string(),
+                    endpoint_id.as_str(),
+                    &endpoint.address.to_string(),
+                    &endpoint.timeline_id.to_string(),
                    branch_name,
                    lsn_str.as_str(),
-                    node.status(),
+                    endpoint.status(),
                ]);
            }

@@ -597,10 +597,10 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
                .unwrap_or(DEFAULT_BRANCH_NAME);
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .map(|node_name| node_name.to_string())
-                .unwrap_or_else(|| format!("{branch_name}_node"));
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .map(String::to_string)
+                .unwrap_or_else(|| format!("ep-{branch_name}"));

            let lsn = sub_args
                .get_one::<String>("lsn")
@@ -618,17 +618,15 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let node =
-                cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
-            println!("{}", node.pgdata().display());
+            cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, lsn, port, pg_version)?;
        }
        "start" => {
            let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .ok_or_else(|| anyhow!("No node name was provided to start"))?;
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));
+            let endpoint = cplane.endpoints.get(endpoint_id.as_str());

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
@@ -638,9 +636,9 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                None
            };

-            if let Some(node) = node {
-                println!("Starting existing postgres {node_name}...");
-                node.start(&auth_token)?;
+            if let Some(endpoint) = endpoint {
+                println!("Starting existing endpoint {endpoint_id}...");
+                endpoint.start(&auth_token)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -665,27 +663,33 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
                // start --port X
                // stop
                // start <-- will also use port X even without explicit port argument
-                println!("Starting new postgres (v{pg_version}) {node_name} on timeline {timeline_id} ...");
+                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");

-                let node =
-                    cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
-                node.start(&auth_token)?;
+                let ep = cplane.new_endpoint(
+                    tenant_id,
+                    endpoint_id,
+                    timeline_id,
+                    lsn,
+                    port,
+                    pg_version,
+                )?;
+                ep.start(&auth_token)?;
            }
        }
        "stop" => {
-            let node_name = sub_args
-                .get_one::<String>("node")
-                .ok_or_else(|| anyhow!("No node name was provided to stop"))?;
+            let endpoint_id = sub_args
+                .get_one::<String>("endpoint_id")
+                .ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
            let destroy = sub_args.get_flag("destroy");

-            let node = cplane
-                .nodes
-                .get(&(tenant_id, node_name.to_string()))
-                .with_context(|| format!("postgres {node_name} is not found"))?;
-            node.stop(destroy)?;
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
+            endpoint.stop(destroy)?;
        }

-        _ => bail!("Unexpected pg subcommand '{sub_name}'"),
+        _ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
    }

    Ok(())
@@ -804,7 +808,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
 }

 fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    // Postgres nodes are not started automatically
+    // Endpoints are not started automatically

    broker::start_broker_process(env)?;

@@ -838,10 +842,10 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<
 fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    let pageserver = PageServerNode::from_env(env);

-    // Stop all compute nodes
+    // Stop all endpoints
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
-            for (_k, node) in cplane.nodes {
+            for (_k, node) in cplane.endpoints {
                if let Err(e) = node.stop(false) {
                    eprintln!("postgres stop failed: {e:#}");
                }
@@ -874,7 +878,9 @@ fn cli() -> Command {
        .help("Name of the branch to be created or used as an alias for other services")
        .required(false);

-    let pg_node_arg = Arg::new("node").help("Postgres node name").required(false);
+    let endpoint_id_arg = Arg::new("endpoint_id")
+        .help("Postgres endpoint id")
+        .required(false);

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

@@ -1028,27 +1034,27 @@ fn cli() -> Command {
                )
        )
        .subcommand(
-            Command::new("pg")
+            Command::new("endpoint")
                .arg_required_else_help(true)
                .about("Manage postgres instances")
                .subcommand(Command::new("list").arg(tenant_id_arg.clone()))
                .subcommand(Command::new("create")
-                    .about("Create a postgres compute node")
-                    .arg(pg_node_arg.clone())
+                    .about("Create a compute endpoint")
+                    .arg(endpoint_id_arg.clone())
                    .arg(branch_name_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(lsn_arg.clone())
                    .arg(port_arg.clone())
                    .arg(
                        Arg::new("config-only")
-                            .help("Don't do basebackup, create compute node with only config files")
+                            .help("Don't do basebackup, create endpoint directory with only config files")
                            .long("config-only")
                            .required(false))
                    .arg(pg_version_arg.clone())
                )
                .subcommand(Command::new("start")
-                    .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
-                    .arg(pg_node_arg.clone())
+                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
+                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
                    .arg(branch_name_arg)
                    .arg(timeline_id_arg)
@@ -1058,7 +1064,7 @@ fn cli() -> Command {
                )
                .subcommand(
                    Command::new("stop")
-                    .arg(pg_node_arg)
+                    .arg(endpoint_id_arg)
                    .arg(tenant_id_arg)
                    .arg(
                        Arg::new("destroy")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -25,54 +25,45 @@ use crate::postgresql_conf::PostgresConf;
 //
 pub struct ComputeControlPlane {
    base_port: u16,
-    pageserver: Arc<PageServerNode>,
-    pub nodes: BTreeMap<(TenantId, String), Arc<PostgresNode>>,
+
+    // endpoint ID is the key
+    pub endpoints: BTreeMap<String, Arc<Endpoint>>,
+
    env: LocalEnv,
+    pageserver: Arc<PageServerNode>,
 }

 impl ComputeControlPlane {
-    // Load current nodes with ports from data directories on disk
-    // Directory structure has the following layout:
-    // pgdatadirs
-    // |- tenants
-    // |  |- <tenant_id>
-    // |  |   |- <node name>
+    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
        let pageserver = Arc::new(PageServerNode::from_env(&env));

-        let mut nodes = BTreeMap::default();
-        let pgdatadirspath = &env.pg_data_dirs_path();
-
-        for tenant_dir in fs::read_dir(pgdatadirspath)
-            .with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
+        let mut endpoints = BTreeMap::default();
+        for endpoint_dir in fs::read_dir(env.endpoints_path())
+            .with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
        {
-            let tenant_dir = tenant_dir?;
-            for timeline_dir in fs::read_dir(tenant_dir.path())
-                .with_context(|| format!("failed to list {}", tenant_dir.path().display()))?
-            {
-                let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?;
-                nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node));
-            }
+            let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
+            endpoints.insert(ep.name.clone(), Arc::new(ep));
        }

        Ok(ComputeControlPlane {
            base_port: 55431,
-            pageserver,
-            nodes,
+            endpoints,
            env,
+            pageserver,
        })
    }

    fn get_port(&mut self) -> u16 {
        1 + self
-            .nodes
+            .endpoints
            .values()
-            .map(|node| node.address.port())
+            .map(|ep| ep.address.port())
            .max()
            .unwrap_or(self.base_port)
    }

-    pub fn new_node(
+    pub fn new_endpoint(
        &mut self,
        tenant_id: TenantId,
        name: &str,
@@ -80,9 +71,9 @@ impl ComputeControlPlane {
        lsn: Option<Lsn>,
        port: Option<u16>,
        pg_version: u32,
-    ) -> Result<Arc<PostgresNode>> {
+    ) -> Result<Arc<Endpoint>> {
        let port = port.unwrap_or_else(|| self.get_port());
-        let node = Arc::new(PostgresNode {
+        let ep = Arc::new(Endpoint {
            name: name.to_owned(),
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            env: self.env.clone(),
@@ -90,44 +81,48 @@ impl ComputeControlPlane {
            timeline_id,
            lsn,
            tenant_id,
-            uses_wal_proposer: false,
            pg_version,
        });

-        node.create_pgdata()?;
-        node.setup_pg_conf()?;
+        ep.create_pgdata()?;
+        ep.setup_pg_conf()?;

-        self.nodes
-            .insert((tenant_id, node.name.clone()), Arc::clone(&node));
+        self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));

-        Ok(node)
+        Ok(ep)
    }
 }

 ///////////////////////////////////////////////////////////////////////////////

 #[derive(Debug)]
-pub struct PostgresNode {
-    pub address: SocketAddr,
+pub struct Endpoint {
+    /// used as the directory name
    name: String,
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    // Some(lsn) if this is a read-only endpoint anchored at 'lsn'. None for the primary.
+    pub lsn: Option<Lsn>,
+
+    // port and address of the Postgres server
+    pub address: SocketAddr,
+    pg_version: u32,
+
+    // These are not part of the endpoint as such, but the environment
+    // the endpoint runs in.
    pub env: LocalEnv,
    pageserver: Arc<PageServerNode>,
-    pub timeline_id: TimelineId,
-    pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
-    pub tenant_id: TenantId,
-    uses_wal_proposer: bool,
-    pg_version: u32,
 }

-impl PostgresNode {
+impl Endpoint {
    fn from_dir_entry(
        entry: std::fs::DirEntry,
        env: &LocalEnv,
        pageserver: &Arc<PageServerNode>,
-    ) -> Result<PostgresNode> {
+    ) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
            anyhow::bail!(
-                "PostgresNode::from_dir_entry failed: '{}' is not a directory",
+                "Endpoint::from_dir_entry failed: '{}' is not a directory",
                entry.path().display()
            );
        }
@@ -137,7 +132,7 @@ impl PostgresNode {
        let name = fname.to_str().unwrap().to_string();

        // Read config file into memory
-        let cfg_path = entry.path().join("postgresql.conf");
+        let cfg_path = entry.path().join("pgdata").join("postgresql.conf");
        let cfg_path_str = cfg_path.to_string_lossy();
        let mut conf_file = File::open(&cfg_path)
            .with_context(|| format!("failed to open config file in {}", cfg_path_str))?;
@@ -149,7 +144,6 @@ impl PostgresNode {
        let port: u16 = conf.parse_field("port", &context)?;
        let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
        let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
-        let uses_wal_proposer = conf.get("neon.safekeepers").is_some();

        // Read postgres version from PG_VERSION file to determine which postgres version binary to use.
        // If it doesn't exist, assume broken data directory and use default pg version.
@@ -164,7 +158,7 @@ impl PostgresNode {
            conf.parse_field_optional("recovery_target_lsn", &context)?;

        // ok now
-        Ok(PostgresNode {
+        Ok(Endpoint {
            address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
            name,
            env: env.clone(),
@@ -172,7 +166,6 @@ impl PostgresNode {
            timeline_id,
            lsn: recovery_target_lsn,
            tenant_id,
-            uses_wal_proposer,
            pg_version,
        })
    }
@@ -273,7 +266,7 @@ impl PostgresNode {
    }

    // Write postgresql.conf with default configuration
-    // and PG_VERSION file to the data directory of a new node.
+    // and PG_VERSION file to the data directory of a new endpoint.
    fn setup_pg_conf(&self) -> Result<()> {
        let mut conf = PostgresConf::new();
        conf.append("max_wal_senders", "10");
@@ -293,7 +286,7 @@ impl PostgresNode {
        // walproposer panics when basebackup is invalid, it is pointless to restart in this case.
        conf.append("restart_after_crash", "off");

-        // Configure the node to fetch pages from pageserver
+        // Configure the Neon Postgres extension to fetch pages from pageserver
        let pageserver_connstr = {
            let config = &self.pageserver.pg_connection_config;
            let (host, port) = (config.host(), config.port());
@@ -329,7 +322,7 @@ impl PostgresNode {
        conf.append("max_replication_flush_lag", "10GB");

        if !self.env.safekeepers.is_empty() {
-            // Configure the node to connect to the safekeepers
+            // Configure Postgres to connect to the safekeepers
            conf.append("synchronous_standby_names", "walproposer");

            let safekeepers = self
@@ -364,7 +357,7 @@ impl PostgresNode {
    fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
        let backup_lsn = if let Some(lsn) = self.lsn {
            Some(lsn)
-        } else if self.uses_wal_proposer {
+        } else if !self.env.safekeepers.is_empty() {
            // LSN 0 means that it is bootstrap and we need to download just
            // latest data from the pageserver. That is a bit clumsy but whole bootstrap
            // procedure evolves quite actively right now, so let's think about it again
@@ -384,8 +377,12 @@ impl PostgresNode {
        Ok(())
    }

+    pub fn endpoint_path(&self) -> PathBuf {
+        self.env.endpoints_path().join(&self.name)
+    }
+
    pub fn pgdata(&self) -> PathBuf {
-        self.env.pg_data_dir(&self.tenant_id, &self.name)
+        self.endpoint_path().join("pgdata")
    }

    pub fn status(&self) -> &str {
@@ -447,12 +444,11 @@ impl PostgresNode {
    }

    pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
-        // Bail if the node already running.
        if self.status() == "running" {
-            anyhow::bail!("The node is already running");
+            anyhow::bail!("The endpoint is already running");
        }

-        // 1. We always start compute node from scratch, so
+        // 1. We always start Postgres from scratch, so
        // if old dir exists, preserve 'postgresql.conf' and drop the directory
        let postgresql_conf_path = self.pgdata().join("postgresql.conf");
        let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
@@ -474,8 +470,8 @@ impl PostgresNode {
            File::create(self.pgdata().join("standby.signal"))?;
        }

-        // 4. Finally start the compute node postgres
-        println!("Starting postgres node at '{}'", self.connstr());
+        // 4. Finally start postgres
+        println!("Starting postgres at '{}'", self.connstr());
        self.pg_ctl(&["start"], auth_token)
    }

@@ -484,7 +480,7 @@ impl PostgresNode {
        // use immediate shutdown mode, otherwise,
        // shutdown gracefully to leave the data directory sane.
        //
-        // Compute node always starts from scratch, so stop
+        // Postgres is always started from scratch, so stop
        // without destroy only used for testing and debugging.
        //
        if destroy {
@@ -493,7 +489,7 @@ impl PostgresNode {
                "Destroying postgres data directory '{}'",
                self.pgdata().to_str().unwrap()
            );
-            fs::remove_dir_all(self.pgdata())?;
+            fs::remove_dir_all(self.endpoint_path())?;
        } else {
            self.pg_ctl(&["stop"], &None)?;
        }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,7 +9,7 @@

 mod background_process;
 pub mod broker;
-pub mod compute;
+pub mod endpoint;
 pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -200,14 +200,8 @@ impl LocalEnv {
        self.neon_distrib_dir.join("storage_broker")
    }

-    pub fn pg_data_dirs_path(&self) -> PathBuf {
-        self.base_data_dir.join("pgdatadirs").join("tenants")
-    }
-
-    pub fn pg_data_dir(&self, tenant_id: &TenantId, branch_name: &str) -> PathBuf {
-        self.pg_data_dirs_path()
-            .join(tenant_id.to_string())
-            .join(branch_name)
+    pub fn endpoints_path(&self) -> PathBuf {
+        self.base_data_dir.join("endpoints")
    }

    // TODO: move pageserver files into ./pageserver
@@ -427,7 +421,7 @@ impl LocalEnv {
            }
        }

-        fs::create_dir_all(self.pg_data_dirs_path())?;
+        fs::create_dir_all(self.endpoints_path())?;

        for safekeeper in &self.safekeepers {
            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -363,6 +363,11 @@ impl PageServerNode {
                .map(|x| serde_json::from_str(x))
                .transpose()
                .context("Failed to parse 'eviction_policy' json")?,
+            min_resident_size_override: settings
+                .remove("min_resident_size_override")
+                .map(|x| x.parse::<u64>())
+                .transpose()
+                .context("Failed to parse 'min_resident_size_override' as integer")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -435,6 +440,11 @@ impl PageServerNode {
                    .map(|x| serde_json::from_str(x))
                    .transpose()
                    .context("Failed to parse 'eviction_policy' json")?,
+                min_resident_size_override: settings
+                    .get("min_resident_size_override")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
            })
            .send()?
            .error_from_body()?;
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "compute_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+chrono.workspace = true
+serde.workspace = true
+serde_with.workspace = true
+serde_json.workspace = true
+
+workspace_hack.workspace = true
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -0,0 +1,2 @@
+pub mod models;
+pub mod spec;
--- a/libs/compute_api/src/models.rs
+++ b/libs/compute_api/src/models.rs
@@ -0,0 +1,52 @@
+//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
+use chrono::{DateTime, Utc};
+use serde::{Serialize, Serializer};
+
+/// Response of the /status API
+///
+#[derive(Clone, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub struct ComputeState {
+    pub status: ComputeStatus,
+    /// Timestamp of the last Postgres activity
+    #[serde(serialize_with = "rfc3339_serialize")]
+    pub last_active: DateTime<Utc>,
+    pub error: Option<String>,
+}
+
+/// Status of the compute node, as carried in the `status` field of [`ComputeState`].
+#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeStatus {
+    /// Spec wasn't provided at start, waiting for it to be
+    /// provided by control-plane.
+    Empty,
+    /// Compute node has spec and initial startup and
+    /// configuration is in progress.
+    Init,
+    /// Compute is configured and running.
+    Running,
+    /// Either startup or configuration failed; compute will exit soon
+    /// or is waiting for control-plane to terminate it.
+    Failed,
+    /// Control-plane requested reconfiguration.
+    ConfigurationPending,
+    /// New spec is being applied.
+    Configuration,
+}
+
+fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    x.to_rfc3339().serialize(s)
+}
+
+/// Response of the /metrics.json API
+#[derive(Clone, Default, Serialize)]
+pub struct ComputeMetrics {
+    pub sync_safekeepers_ms: u64,
+    pub basebackup_ms: u64,
+    pub config_ms: u64,
+    pub total_startup_ms: u64,
+}
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -0,0 +1,94 @@
+//! `ComputeSpec` represents the contents of the spec.json file.
+//!
+//! The spec.json file is used to pass information to 'compute_ctl'. It contains
+//! all the information needed to start up the right version of PostgreSQL,
+//! and connect it to the storage nodes.
+use serde::Deserialize;
+use std::collections::HashMap;
+
+/// String type alias representing Postgres identifier and
+/// intended to be used for DB / role names.
+pub type PgIdent = String;
+
+/// Cluster spec or configuration represented as an optional number of
+/// delta operations + final cluster state description.
+#[derive(Clone, Debug, Deserialize)]
+pub struct ComputeSpec {
+    pub format_version: f32,
+    pub timestamp: String,
+    pub operation_uuid: Option<String>,
+    /// Expected cluster state at the end of transition process.
+    pub cluster: Cluster,
+    pub delta_operations: Option<Vec<DeltaOp>>,
+
+    pub storage_auth_token: Option<String>,
+
+    pub startup_tracing_context: Option<HashMap<String, String>>,
+}
+
+#[derive(Clone, Debug, Deserialize)]
+pub struct Cluster {
+    pub cluster_id: String,
+    pub name: String,
+    pub state: Option<String>,
+    pub roles: Vec<Role>,
+    pub databases: Vec<Database>,
+    pub settings: GenericOptions,
+}
+
+/// Single cluster state changing operation that could not be represented as
+/// a static `Cluster` structure. For example:
+/// - DROP DATABASE
+/// - DROP ROLE
+/// - ALTER ROLE name RENAME TO new_name
+/// - ALTER DATABASE name RENAME TO new_name
+#[derive(Clone, Debug, Deserialize)]
+pub struct DeltaOp {
+    pub action: String,
+    pub name: PgIdent,
+    pub new_name: Option<PgIdent>,
+}
+
+/// Rust representation of Postgres role info with only those fields
+/// that matter for us.
+#[derive(Clone, Debug, Deserialize)]
+pub struct Role {
+    pub name: PgIdent,
+    pub encrypted_password: Option<String>,
+    pub options: GenericOptions,
+}
+
+/// Rust representation of Postgres database info with only those fields
+/// that matter for us.
+#[derive(Clone, Debug, Deserialize)]
+pub struct Database {
+    pub name: PgIdent,
+    pub owner: PgIdent,
+    pub options: GenericOptions,
+}
+
+/// Common type representing both SQL statement params with or without value,
+/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
+/// options like `wal_level = logical`.
+#[derive(Clone, Debug, Deserialize)]
+pub struct GenericOption {
+    pub name: String,
+    pub value: Option<String>,
+    pub vartype: String,
+}
+
+/// Optional collection of `GenericOption`'s. Type alias allows us to
+/// declare a `trait` on it.
+pub type GenericOptions = Option<Vec<GenericOption>>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs::File;
+
+    #[test]
+    fn parse_spec_file() {
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+    }
+}
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -120,6 +120,7 @@ pub struct TenantCreateRequest {
    // We might do that once the eviction feature has stabilizied.
    // For now, this field is not even documented in the openapi_spec.yml.
    pub eviction_policy: Option<serde_json::Value>,
+    pub min_resident_size_override: Option<u64>,
 }

 #[serde_as]
@@ -165,6 +166,7 @@ pub struct TenantConfigRequest {
    // We might do that once the eviction feature has stabilizied.
    // For now, this field is not even documented in the openapi_spec.yml.
    pub eviction_policy: Option<serde_json::Value>,
+    pub min_resident_size_override: Option<u64>,
 }

 impl TenantConfigRequest {
@@ -185,6 +187,7 @@ impl TenantConfigRequest {
            max_lsn_wal_lag: None,
            trace_read_requests: None,
            eviction_policy: None,
+            min_resident_size_override: None,
        }
    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -936,35 +936,40 @@ impl<'a> BeMessage<'a> {
    }
 }

-// Neon extension of postgres replication protocol
-// See NEON_STATUS_UPDATE_TAG_BYTE
+/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
+/// Serialized in custom flexible key/value format. In replication protocol, it
+/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
+/// Standby status update / Hot standby feedback messages.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct ReplicationFeedback {
-    // Last known size of the timeline. Used to enforce timeline size limit.
+pub struct PageserverFeedback {
+    /// Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
-    // Parts of StandbyStatusUpdate we resend to compute via safekeeper
-    pub ps_writelsn: u64,
-    pub ps_applylsn: u64,
-    pub ps_flushlsn: u64,
-    pub ps_replytime: SystemTime,
+    /// LSN last received and ingested by the pageserver.
+    pub last_received_lsn: u64,
+    /// LSN up to which data is persisted by the pageserver to its local disk.
+    pub disk_consistent_lsn: u64,
+    /// LSN up to which data is persisted by the pageserver to S3; safekeepers
+    /// consider that WAL before this LSN can be removed.
+    pub remote_consistent_lsn: u64,
+    pub replytime: SystemTime,
 }

-// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
+// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
 // Do not remove previously available fields because this might be backwards incompatible.
-pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;

-impl ReplicationFeedback {
-    pub fn empty() -> ReplicationFeedback {
-        ReplicationFeedback {
+impl PageserverFeedback {
+    pub fn empty() -> PageserverFeedback {
+        PageserverFeedback {
            current_timeline_size: 0,
-            ps_writelsn: 0,
-            ps_applylsn: 0,
-            ps_flushlsn: 0,
-            ps_replytime: SystemTime::now(),
+            last_received_lsn: 0,
+            remote_consistent_lsn: 0,
+            disk_consistent_lsn: 0,
+            replytime: SystemTime::now(),
        }
    }

-    // Serialize ReplicationFeedback using custom format
+    // Serialize PageserverFeedback using custom format
    // to support protocol extensibility.
    //
    // Following layout is used:
@@ -974,24 +979,26 @@ impl ReplicationFeedback {
    // null-terminated string - key,
    // uint32 - value length in bytes
    // value itself
+    //
+    // TODO: rename the serialized field keys once all computes have migrated to the new names.
    pub fn serialize(&self, buf: &mut BytesMut) {
-        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);

        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
-        buf.put_u64(self.ps_writelsn);
+        buf.put_u64(self.last_received_lsn);
        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
-        buf.put_u64(self.ps_flushlsn);
+        buf.put_u64(self.disk_consistent_lsn);
        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
-        buf.put_u64(self.ps_applylsn);
+        buf.put_u64(self.remote_consistent_lsn);

        let timestamp = self
-            .ps_replytime
+            .replytime
            .duration_since(*PG_EPOCH)
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;
@@ -1001,9 +1008,10 @@ impl ReplicationFeedback {
        buf.put_i64(timestamp);
    }

-    // Deserialize ReplicationFeedback message
-    pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
-        let mut rf = ReplicationFeedback::empty();
+    // Deserialize PageserverFeedback message
+    // TODO: rename the serialized field keys once all computes have migrated to the new names.
+    pub fn parse(mut buf: Bytes) -> PageserverFeedback {
+        let mut rf = PageserverFeedback::empty();
        let nfields = buf.get_u8();
        for _ in 0..nfields {
            let key = read_cstr(&mut buf).unwrap();
@@ -1016,39 +1024,39 @@ impl ReplicationFeedback {
                b"ps_writelsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
-                    rf.ps_writelsn = buf.get_u64();
+                    rf.last_received_lsn = buf.get_u64();
                }
                b"ps_flushlsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
-                    rf.ps_flushlsn = buf.get_u64();
+                    rf.disk_consistent_lsn = buf.get_u64();
                }
                b"ps_applylsn" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
-                    rf.ps_applylsn = buf.get_u64();
+                    rf.remote_consistent_lsn = buf.get_u64();
                }
                b"ps_replytime" => {
                    let len = buf.get_i32();
                    assert_eq!(len, 8);
                    let raw_time = buf.get_i64();
                    if raw_time > 0 {
-                        rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
+                        rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
                    } else {
-                        rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
+                        rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
                    }
                }
                _ => {
                    let len = buf.get_i32();
                    warn!(
-                        "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
+                        "PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
                        String::from_utf8_lossy(key.as_ref())
                    );
                    buf.advance(len as usize);
                }
            }
        }
-        trace!("ReplicationFeedback parsed is {:?}", rf);
+        trace!("PageserverFeedback parsed is {:?}", rf);
        rf
    }
 }
@@ -1059,33 +1067,33 @@ mod tests {

    #[test]
    fn test_replication_feedback_serialization() {
-        let mut rf = ReplicationFeedback::empty();
+        let mut rf = PageserverFeedback::empty();
        // Fill rf with some values
        rf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
        // because it is rounded up to microseconds during serialization.
-        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        let mut data = BytesMut::new();
        rf.serialize(&mut data);

-        let rf_parsed = ReplicationFeedback::parse(data.freeze());
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
    }

    #[test]
    fn test_replication_feedback_unknown_key() {
-        let mut rf = ReplicationFeedback::empty();
+        let mut rf = PageserverFeedback::empty();
        // Fill rf with some values
        rf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
        // because it is rounded up to microseconds during serialization.
-        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
+        rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        let mut data = BytesMut::new();
        rf.serialize(&mut data);

        // Add an extra field to the buffer and adjust number of keys
        if let Some(first) = data.first_mut() {
-            *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
+            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
        }

        data.put_slice(b"new_field_one\0");
@@ -1093,7 +1101,7 @@ mod tests {
        data.put_u64(42);

        // Parse serialized data and check that new field is not parsed
-        let rf_parsed = ReplicationFeedback::parse(data.freeze());
+        let rf_parsed = PageserverFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -78,9 +78,6 @@ impl RemotePath {
 /// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
-
    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -73,10 +73,8 @@ impl LocalFs {
            Ok(None)
        }
    }
-}

-#[async_trait::async_trait]
-impl RemoteStorage for LocalFs {
+    #[cfg(test)]
    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
        Ok(get_all_files(&self.storage_root, true)
            .await?
@@ -91,7 +89,10 @@ impl RemoteStorage for LocalFs {
            })
            .collect())
    }
+}

+#[async_trait::async_trait]
+impl RemoteStorage for LocalFs {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -275,50 +275,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
-        let mut document_keys = Vec::new();
-
-        let mut continuation_token = None;
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")?;
-
-            metrics::inc_list_objects();
-
-            let fetch_response = self
-                .client
-                .list_objects_v2()
-                .bucket(self.bucket_name.clone())
-                .set_prefix(self.prefix_in_bucket.clone())
-                .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
-                .set_continuation_token(continuation_token)
-                .set_max_keys(self.max_keys_per_list_response)
-                .send()
-                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })?;
-            document_keys.extend(
-                fetch_response
-                    .contents
-                    .unwrap_or_default()
-                    .into_iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
-            );
-
-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
-                None => break,
-            }
-        }
-
-        Ok(document_keys)
-    }
-
    /// See the doc for `RemoteStorage::list_prefixes`
    /// Note: it wont include empty "directories"
    async fn list_prefixes(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -20,7 +20,6 @@ pub struct UnreliableWrapper {
 /// Used to identify retries of different unique operation.
 #[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
-    List,
    ListPrefixes(Option<RemotePath>),
    Upload(RemotePath),
    Download(RemotePath),
@@ -75,12 +74,6 @@ impl UnreliableWrapper {

 #[async_trait::async_trait]
 impl RemoteStorage for UnreliableWrapper {
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
-        self.attempt(RemoteOp::List)?;
-        self.inner.list().await
-    }
-
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -19,6 +19,7 @@ jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
+regex.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -51,6 +51,9 @@ pub mod history_buffer;

 pub mod measured_stream;

+pub mod serde_percent;
+pub mod serde_regex;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
--- a/libs/utils/src/serde_percent.rs
+++ b/libs/utils/src/serde_percent.rs
@@ -0,0 +1,83 @@
+//! A serde::Deserialize type for percentages.
+//!
+//! See [`Percent`] for details.
+
+use serde::{Deserialize, Serialize};
+
+/// If the value is not an integer between 0 and 100,
+/// deserialization fails with a descriptive error.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+#[serde(transparent)]
+pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8);
+
+impl Percent {
+    pub fn get(&self) -> u8 {
+        self.0
+    }
+}
+
+fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u8, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let v: u8 = serde::de::Deserialize::deserialize(deserializer)?;
+    if v > 100 {
+        return Err(serde::de::Error::custom(
+            "must be an integer between 0 and 100",
+        ));
+    }
+    Ok(v)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Percent;
+
+    #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)]
+    struct Foo {
+        bar: Percent,
+    }
+
+    #[test]
+    fn basics() {
+        let input = r#"{ "bar": 50 }"#;
+        let foo: Foo = serde_json::from_str(input).unwrap();
+        assert_eq!(foo.bar.get(), 50);
+    }
+    #[test]
+    fn null_handling() {
+        let input = r#"{ "bar": null }"#;
+        let res: Result<Foo, _> = serde_json::from_str(input);
+        assert!(res.is_err());
+    }
+    #[test]
+    fn zero() {
+        let input = r#"{ "bar": 0 }"#;
+        let foo: Foo = serde_json::from_str(input).unwrap();
+        assert_eq!(foo.bar.get(), 0);
+    }
+    #[test]
+    fn out_of_range_above() {
+        let input = r#"{ "bar": 101 }"#;
+        let res: Result<Foo, _> = serde_json::from_str(input);
+        assert!(res.is_err());
+    }
+    #[test]
+    fn out_of_range_below() {
+        let input = r#"{ "bar": -1 }"#;
+        let res: Result<Foo, _> = serde_json::from_str(input);
+        assert!(res.is_err());
+    }
+    #[test]
+    fn float() {
+        let input = r#"{ "bar": 50.5 }"#;
+        let res: Result<Foo, _> = serde_json::from_str(input);
+        assert!(res.is_err());
+    }
+    #[test]
+    fn string() {
+        let input = r#"{ "bar": "50 %" }"#;
+        let res: Result<Foo, _> = serde_json::from_str(input);
+        assert!(res.is_err());
+    }
+}
--- a/libs/utils/src/serde_regex.rs
+++ b/libs/utils/src/serde_regex.rs
@@ -0,0 +1,60 @@
+//! A `serde::{Deserialize,Serialize}` type for regexes.
+
+use std::ops::Deref;
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct Regex(
+    #[serde(
+        deserialize_with = "deserialize_regex",
+        serialize_with = "serialize_regex"
+    )]
+    regex::Regex,
+);
+
+fn deserialize_regex<'de, D>(deserializer: D) -> Result<regex::Regex, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+    let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?;
+    Ok(re)
+}
+
+fn serialize_regex<S>(re: &regex::Regex, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::ser::Serializer,
+{
+    serializer.collect_str(re.as_str())
+}
+
+impl Deref for Regex {
+    type Target = regex::Regex;
+
+    fn deref(&self) -> &regex::Regex {
+        &self.0
+    }
+}
+
+impl PartialEq for Regex {
+    fn eq(&self, other: &Regex) -> bool {
+        // comparing the automatons would be quite complicated
+        self.as_str() == other.as_str()
+    }
+}
+
+impl Eq for Regex {}
+
+#[cfg(test)]
+mod tests {
+
+    #[test]
+    fn roundtrip() {
+        let input = r#""foo.*bar""#;
+        let re: super::Regex = serde_json::from_str(input).unwrap();
+        assert!(re.is_match("foo123bar"));
+        assert!(!re.is_match("foo"));
+        let output = serde_json::to_string(&re).unwrap();
+        assert_eq!(output, input);
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -48,6 +48,7 @@ serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use remote_storage::GenericRemoteStorage;
 use tracing::*;

@@ -314,14 +315,34 @@ fn start_pageserver(
    // Scan the local 'tenants/' directory and start loading the tenants
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;

+    // shared state between the disk-usage based eviction background task and the http endpoint
+    // that allows triggering disk-usage based eviction manually. note that the http endpoint
+    // is still accessible even if background task is not configured as long as remote storage has
+    // been configured.
+    let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
+
+    if let Some(remote_storage) = &remote_storage {
+        launch_disk_usage_global_eviction_task(
+            conf,
+            remote_storage.clone(),
+            disk_usage_eviction_state.clone(),
+        )?;
+    }
+
    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router = http::make_router(conf, launch_ts, http_auth, remote_storage)?
-            .build()
-            .map_err(|err| anyhow!(err))?;
+        let router = http::make_router(
+            conf,
+            launch_ts,
+            http_auth,
+            remote_storage,
+            disk_usage_eviction_state,
+        )?
+        .build()
+        .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
        let server = hyper::Server::from_tcp(http_listener)?
            .serve(service)
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -27,6 +27,7 @@ use utils::{
    logging::LogFormat,
 };

+use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
@@ -92,6 +93,8 @@ pub mod defaults {

 #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'

+#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -104,6 +107,8 @@ pub mod defaults {
 #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
 #pitr_interval = '{DEFAULT_PITR_INTERVAL}'

+#min_resident_size_override = .. # in bytes
+
 # [remote_storage]

 "###
@@ -180,6 +185,8 @@ pub struct PageServerConf {
    // See the corresponding metric's help string.
    pub evictions_low_residence_duration_metric_threshold: Duration,

+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -252,6 +259,8 @@ struct PageServerConfigBuilder {

    evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,

+    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
+
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
@@ -312,6 +321,8 @@ impl Default for PageServerConfigBuilder {
            )
            .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),

+            disk_usage_based_eviction: Set(None),
+
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
@@ -431,6 +442,10 @@ impl PageServerConfigBuilder {
        self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
    }

+    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
+        self.disk_usage_based_eviction = BuilderValue::Set(value);
+    }
+
    pub fn ondemand_download_behavior_treat_error_as_warn(
        &mut self,
        ondemand_download_behavior_treat_error_as_warn: bool,
@@ -515,6 +530,9 @@ impl PageServerConfigBuilder {
                .ok_or(anyhow!(
                    "missing evictions_low_residence_duration_metric_threshold"
                ))?,
+            disk_usage_based_eviction: self
+                .disk_usage_based_eviction
+                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
            test_remote_failures: self
                .test_remote_failures
                .ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -704,6 +722,12 @@ impl PageServerConf {
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
                "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
+                "disk_usage_based_eviction" => {
+                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
+                    builder.disk_usage_based_eviction(
+                    toml_edit::de::from_item(item.clone())
+                    .context("parse disk_usage_based_eviction")?)
+                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -808,6 +832,13 @@ impl PageServerConf {
            );
        }

+        if let Some(item) = item.get("min_resident_size_override") {
+            t_conf.min_resident_size_override = Some(
+                toml_edit::de::from_item(item.clone())
+                    .context("parse min_resident_size_override")?,
+            );
+        }
+
        Ok(t_conf)
    }

@@ -850,6 +881,7 @@ impl PageServerConf {
                defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
            )
            .unwrap(),
+            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
        }
@@ -1058,6 +1090,7 @@ log_format = 'json'
                evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
                    defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
                )?,
+                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
@@ -1112,6 +1145,7 @@ log_format = 'json'
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                synthetic_size_calculation_interval: Duration::from_secs(333),
                evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
+                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -0,0 +1,689 @@
+//! This module implements the pageserver-global disk-usage-based layer eviction task.
+//!
+//! # Mechanics
+//!
+//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
+//! loop that evicts layers in response to a shortage of available bytes
+//! in the $repo/tenants directory's filesystem.
+//!
+//! The loop runs periodically at a configurable `period`.
+//!
+//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
+//! It compares the returned usage data against two different types of thresholds.
+//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
+//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
+//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
+//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
+//!
+//! # Eviction Policy
+//!
+//! There are two thresholds:
+//! `max_usage_pct` is the relative used space, expressed in percent of the total filesystem space.
+//! If the actual usage is higher, the threshold is exceeded.
+//! `min_avail_bytes` is the absolute available space in bytes.
+//! If the actual available space is lower, the threshold is exceeded.
+//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
+//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
+//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
+//! The reservation is to keep the most recently accessed X bytes per tenant resident.
+//! If we cannot relieve pressure by evicting layers outside of the reservation, we
+//! start evicting layers that are part of the reservation, LRU first.
+//!
+//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
+//! throughout the code, but, no actual variable carries that name.
+//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
+//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
+//! during page reconstruction.
+//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
+//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
+
+// Implementation notes:
+// - The `#[allow(dead_code)]` attributes above various structs suppress warnings about only the
+//   Debug impl reading these fields. We use the Debug impl for semi-structured logging, though.
+
+use std::{
+    collections::HashMap,
+    path::Path,
+    sync::Arc,
+    time::{Duration, SystemTime},
+};
+
+use anyhow::Context;
+use remote_storage::GenericRemoteStorage;
+use serde::{Deserialize, Serialize};
+use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::serde_percent::Percent;
+
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    tenant::{self, storage_layer::PersistentLayer, Timeline},
+};
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DiskUsageEvictionTaskConfig {
+    pub max_usage_pct: Percent, // pressure if the used share of the filesystem, in percent, exceeds this
+    pub min_avail_bytes: u64, // pressure if fewer bytes than this are available
+    #[serde(with = "humantime_serde")]
+    pub period: Duration, // time between the starts of two consecutive background iterations
+    #[cfg(feature = "testing")]
+    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>, // testing builds only: handed to `Statvfs::get` as its mock configuration
+}
+
+#[derive(Default)]
+pub struct State {
+    /// Taken with `try_lock` for a whole eviction iteration, so http-triggered and background iterations never overlap; a contender fails instead of waiting.
+    mutex: tokio::sync::Mutex<()>,
+}
+
+/// Spawn the disk-usage-based eviction loop as a `task_mgr` task on `BACKGROUND_RUNTIME`.
+/// Only logs and returns `Ok(())` if `conf.disk_usage_based_eviction` is `None`.
+pub fn launch_disk_usage_global_eviction_task(
+    conf: &'static PageServerConf,
+    storage: GenericRemoteStorage,
+    state: Arc<State>,
+) -> anyhow::Result<()> {
+    let task_config = match conf.disk_usage_based_eviction.as_ref() {
+        Some(task_config) => task_config,
+        None => {
+            info!("disk usage based eviction task not configured");
+            return Ok(());
+        }
+    };
+    info!("launching disk usage based eviction task");
+
+    // The future takes ownership of `state` and `storage`; the shutdown token is fetched inside it.
+    let eviction_loop = async move {
+        let tenants_dir = conf.tenants_path();
+        let cancel = task_mgr::shutdown_token();
+        disk_usage_eviction_task(&state, task_config, storage, &tenants_dir, cancel).await;
+        info!("disk usage based eviction task finishing");
+        Ok(())
+    };
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "disk usage based eviction",
+        false,
+        eviction_loop,
+    );
+    Ok(())
+}
+
+/// Background loop: after the initial delay from `random_init_delay`, runs one eviction
+/// iteration per `task_config.period` until `cancel` fires.
+/// A failed iteration is logged as a warning and the loop carries on.
+#[instrument(skip_all)]
+async fn disk_usage_eviction_task(
+    state: &State,
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: GenericRemoteStorage,
+    tenants_dir: &Path,
+    cancel: CancellationToken,
+) {
+    use crate::tenant::tasks::random_init_delay;
+
+    // An `Err` from the initial delay is treated as a request to shut down.
+    let init_delay = random_init_delay(task_config.period, &cancel).await;
+    if init_delay.is_err() {
+        info!("shutting down");
+        return;
+    }
+
+    let mut iteration_no = 0;
+    loop {
+        iteration_no += 1;
+        let iteration_started_at = Instant::now();
+
+        // Run the iteration inside its own span so that its log lines carry `iteration_no`.
+        let iteration = async {
+            let outcome = disk_usage_eviction_task_iteration(
+                state,
+                task_config,
+                &storage,
+                tenants_dir,
+                &cancel,
+            )
+            .await;
+            // these stat failures are expected to be very rare
+            if let Err(e) = outcome {
+                warn!("iteration failed, unexpected error: {e:#}");
+            }
+        };
+        iteration
+            .instrument(tracing::info_span!("iteration", iteration_no))
+            .await;
+
+        // The period is measured from the start of the iteration, not from its end.
+        let next_iteration_at = iteration_started_at + task_config.period;
+        tokio::select! {
+            _ = tokio::time::sleep_until(next_iteration_at) => {},
+            _ = cancel.cancelled() => {
+                info!("shutting down");
+                break
+            }
+        }
+    }
+}
+
+pub trait Usage: Clone + Copy + std::fmt::Debug {
+    fn has_pressure(&self) -> bool; // true while this usage still calls for (more) eviction
+    fn add_available_bytes(&mut self, bytes: u64); // account for `bytes` freed by a planned or performed eviction
+}
+
+/// One loop iteration: measure usage, run the eviction, then re-measure to verify the effect.
+async fn disk_usage_eviction_task_iteration(
+    state: &State,
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
+    tenants_dir: &Path,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
+        .context("get filesystem-level disk usage before evictions")?;
+
+    // A failed iteration is logged, not propagated; `Err` is reserved for statvfs failures.
+    let iteration_result =
+        disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let outcome = match iteration_result {
+        Ok(outcome) => outcome,
+        Err(e) => {
+            error!("disk_usage_eviction_iteration failed: {:#}", e);
+            return Ok(());
+        }
+    };
+    debug!(?outcome, "disk_usage_eviction_iteration finished");
+
+    // Only a finished iteration is verified; `NoPressure` and `Cancelled` need no follow-up.
+    let IterationOutcome::Finished(outcome) = outcome else {
+        return Ok(());
+    };
+
+    // Verify with statvfs whether we made any real progress
+    let after = filesystem_level_usage::get(tenants_dir, task_config)
+        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
+        .context("get filesystem-level disk usage after evictions")?;
+    debug!(?after, "disk usage");
+    if !after.has_pressure() {
+        info!(?outcome, ?after, "disk usage pressure relieved");
+    } else {
+        // Don't bother doing an out-of-order iteration here now.
+        // In practice, the task period is set to a value in the tens-of-seconds range,
+        // which will cause another iteration to happen soon enough.
+        // TODO: deltas between the three different usages would be helpful,
+        // consider MiB, GiB, TiB
+        warn!(?outcome, ?after, "disk usage still high");
+    }
+    Ok(())
+}
+
+#[derive(Debug, Serialize)]
+#[allow(clippy::large_enum_variant)]
+pub enum IterationOutcome<U> {
+    NoPressure, // `usage_pre` showed no pressure, so nothing was attempted
+    Cancelled, // cancellation was observed while collecting candidates or between eviction batches
+    Finished(IterationOutcomeFinished<U>), // both phases ran to the end; carries the usage accounting
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+pub struct IterationOutcomeFinished<U> {
+    /// The usage the iteration was started with (`usage_pre`), observed before any eviction.
+    before: U,
+    /// The usage projected by phase 1: `before` plus the sizes of all layers selected as victims.
+    planned: PlannedUsage<U>,
+    /// The outcome of phase 2, where we actually do the evictions.
+    ///
+    /// If all layers that phase 1 planned to evict _can_ actually get evicted, its
+    /// `projected_after` will match the final planned usage.
+    assumed: AssumedUsage<U>,
+}
+
+#[derive(Debug, Serialize)]
+#[allow(dead_code)]
+struct AssumedUsage<U> {
+    /// Usage per internal accounting after phase 2: `before` plus the sizes of layers actually evicted.
+    projected_after: U,
+    /// Layers whose eviction reported `Ok(false)` in phase 2; errors are only logged, not counted here.
+    failed: LayerCount,
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+struct PlannedUsage<U> {
+    respecting_tenant_min_resident_size: U, // planned usage just before the first `Below` candidate was taken, or the final planned usage if none was
+    fallback_to_global_lru: Option<U>, // `Some(final planned usage)` only if a `Below` candidate had to be taken
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Default, Serialize)]
+struct LayerCount {
+    file_sizes: u64, // sum of `file_size()` over the counted layers
+    count: usize, // number of counted layers
+}
+
+/// Run one eviction iteration: phase 1 plans victims in eviction order until `usage_pre` would
+/// no longer have pressure, phase 2 evicts them batched per timeline.
+/// Errors if another iteration holds `state.mutex` or if collecting candidates fails.
+pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+    state: &State,
+    storage: &GenericRemoteStorage,
+    usage_pre: U,
+    cancel: &CancellationToken,
+) -> anyhow::Result<IterationOutcome<U>> {
+    // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
+    let _g = state
+        .mutex
+        .try_lock()
+        .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
+
+    debug!(?usage_pre, "disk usage");
+
+    if !usage_pre.has_pressure() {
+        return Ok(IterationOutcome::NoPressure);
+    }
+
+    warn!(?usage_pre, "running disk usage based eviction due to pressure");
+
+    let candidates = match collect_eviction_candidates(cancel).await? {
+        EvictionCandidates::Cancelled => {
+            return Ok(IterationOutcome::Cancelled);
+        }
+        EvictionCandidates::Finished(partitioned) => partitioned,
+    };
+
+    // Debug-log the list of candidates. `SystemTime` is not monotonic: if the clock stepped
+    // backwards, report an age of zero instead of panicking on the `Err` from `duration_since`.
+    let now = SystemTime::now();
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        debug!(
+            "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}",
+            i + 1,
+            candidates.len(),
+            candidate.layer.file_size(),
+            now.duration_since(candidate.last_activity_ts)
+                .unwrap_or_default()
+                .as_micros(),
+            partition,
+            candidate.layer.get_tenant_id(),
+            candidate.layer.get_timeline_id(),
+            candidate.layer.filename().file_name(),
+        );
+    }
+
+    // phase1: select victims to relieve pressure
+    //
+    // Walk through the list of candidates, until we have accumulated enough layers to get
+    // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
+    // how much disk space would be used after evicting all the layers up to the current
+    // point in the list. The layers are collected in 'batched', grouped per timeline.
+    //
+    // If we get far enough in the list that we start to evict layers that are below
+    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
+    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
+    let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
+    let mut warned = None;
+    let mut usage_planned = usage_pre;
+    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+        if !usage_planned.has_pressure() {
+            debug!(
+                no_candidates_evicted = i,
+                "took enough candidates for pressure to be relieved"
+            );
+            break;
+        }
+
+        if partition == MinResidentSizePartition::Below && warned.is_none() {
+            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+            warned = Some(usage_planned);
+        }
+
+        usage_planned.add_available_bytes(candidate.layer.file_size());
+
+        batched
+            .entry(TimelineKey(candidate.timeline))
+            .or_default()
+            .push(candidate.layer);
+    }
+
+    let usage_planned = match warned {
+        Some(respecting_tenant_min_resident_size) => PlannedUsage {
+            respecting_tenant_min_resident_size,
+            fallback_to_global_lru: Some(usage_planned),
+        },
+        None => PlannedUsage {
+            respecting_tenant_min_resident_size: usage_planned,
+            fallback_to_global_lru: None,
+        },
+    };
+    debug!(?usage_planned, "usage planned");
+
+    // phase2: evict victims batched by timeline
+
+    // After the loop, `usage_assumed` is the post-eviction usage, according to internal accounting.
+    let mut usage_assumed = usage_pre;
+    let mut evictions_failed = LayerCount::default();
+    for (timeline, batch) in batched {
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;
+        let batch_size = batch.len();
+
+        debug!(%timeline_id, "evicting batch for timeline");
+
+        async {
+            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+
+            match results {
+                Err(e) => {
+                    warn!("failed to evict batch: {:#}", e);
+                }
+                Ok(results) => {
+                    assert_eq!(results.len(), batch.len());
+                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        match result {
+                            Some(Ok(true)) => {
+                                usage_assumed.add_available_bytes(layer.file_size());
+                            }
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
+                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.count += 1;
+                            }
+                            None => {
+                                assert!(cancel.is_cancelled());
+                                return;
+                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
+        .await;
+
+        if cancel.is_cancelled() {
+            return Ok(IterationOutcome::Cancelled);
+        }
+    }
+
+    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
+        before: usage_pre,
+        planned: usage_planned,
+        assumed: AssumedUsage {
+            projected_after: usage_assumed,
+            failed: evictions_failed,
+        },
+    }))
+}
+
+#[derive(Clone)]
+struct EvictionCandidate {
+    timeline: Arc<Timeline>, // timeline the layer was listed from; evictions are batched by it
+    layer: Arc<dyn PersistentLayer>, // the resident layer that may get evicted
+    last_activity_ts: SystemTime, // LRU sort key: within a partition, older timestamps are evicted first
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+enum MinResidentSizePartition {
+    Above, // must stay declared first: the derived `Ord` gives `Above < Below`, so these sort (and get evicted) first
+    Below, // layers inside the tenant's min_resident_size reservation; only evicted as a fallback
+}
+
+enum EvictionCandidates {
+    Cancelled, // cancellation was observed while collecting
+    Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), // all candidates, sorted in eviction order
+}
+
+/// Gather the eviction candidates.
+///
+/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction
+/// order. A caller that evicts in that order, until pressure is relieved, implements
+/// the eviction policy outlined in the module comment.
+///
+/// # Example
+///
+/// Imagine that there are two tenants, A and B, with five layers each, a-e.
+/// Each layer has size 100, and both tenant's min_resident_size is 150.
+/// The eviction order would be
+///
+/// ```text
+/// partition last_activity_ts    tenant/layer
+/// Above     18:29               B/c
+/// Above     18:30               A/c
+/// Above     19:00               A/b
+/// Above     19:05               B/b
+/// Above     20:00               B/a
+/// Above     20:03               A/a
+/// Below     20:30               A/d
+/// Below     20:40               B/d
+/// Below     20:45               B/e
+/// Below     20:58               A/e
+/// ```
+///
+/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `B/c, A/c, A/b`.
+/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size.
+///
+/// But, if we need to evict 900 bytes to relieve pressure, we'd evict
+/// `B/c, A/c, A/b, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
+/// after exhausting the `Above` partition.
+/// So, we did not respect each tenant's min_resident_size.
+async fn collect_eviction_candidates(
+    cancel: &CancellationToken,
+) -> anyhow::Result<EvictionCandidates> {
+    // get a snapshot of the list of tenants
+    let tenants = tenant::mgr::list_tenants()
+        .await
+        .context("get list of tenants")?;
+
+    let mut candidates = Vec::new();
+
+    for (tenant_id, _state) in &tenants {
+        if cancel.is_cancelled() {
+            return Ok(EvictionCandidates::Cancelled);
+        }
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+            Ok(tenant) => tenant,
+            Err(e) => {
+                // this can happen if tenant has lifecycle transition after we fetched it
+                debug!("failed to get tenant: {e:#}");
+                continue;
+            }
+        };
+
+        // collect layers from all timelines in this tenant
+        //
+        // If one of the timelines becomes `!is_active()` during the iteration,
+        // for example because we're shutting down, then `max_layer_size` can be too small.
+        // That's OK. This code only runs under a disk pressure situation, and being
+        // a little unfair to tenants during shutdown in such a situation is tolerable.
+        let mut tenant_candidates = Vec::new();
+        let mut max_layer_size = 0;
+        for tl in tenant.list_timelines() {
+            if !tl.is_active() {
+                continue;
+            }
+            let info = tl.get_local_layers_for_disk_usage_eviction();
+            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+            tenant_candidates.extend(
+                info.resident_layers
+                    .into_iter()
+                    .map(|layer_infos| (tl.clone(), layer_infos)),
+            );
+            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
+
+            if cancel.is_cancelled() {
+                return Ok(EvictionCandidates::Cancelled);
+            }
+        }
+
+        // `min_resident_size` defaults to maximum layer file size of the tenant.
+        // This ensures that each tenant can have at least one layer resident at a given time,
+        // ensuring forward progress for a single Timeline::get in that tenant.
+        // It's a questionable heuristic since, usually, there are many Timeline::get
+        // requests going on for a tenant, and, at least in Neon prod, the median
+        // layer file size is much smaller than the compaction target size.
+        // We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
+        // That's what's typically used by the various background loops.
+        //
+        // The default can be overridden with a fixed value in the tenant conf.
+        // A default override can be put in the default tenant conf in the pageserver.toml.
+        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
+            debug!(
+                tenant_id=%tenant.tenant_id(),
+                overriden_size=s,
+                "using overridden min resident size for tenant"
+            );
+            s
+        } else {
+            debug!(
+                tenant_id=%tenant.tenant_id(),
+                max_layer_size,
+                "using max layer size as min_resident_size for tenant",
+            );
+            max_layer_size
+        };
+
+        // Sort layers most-recently-used first, then partition by cumsum: `cumsum` is the total size
+        // of the strictly more recently used layers, so the layer crossing min_resident_size is still `Below`.
+        tenant_candidates
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
+        let mut cumsum: i128 = 0;
+        for (timeline, layer_info) in tenant_candidates.into_iter() {
+            let file_size = layer_info.file_size();
+            let candidate = EvictionCandidate {
+                timeline,
+                last_activity_ts: layer_info.last_activity_ts,
+                layer: layer_info.layer,
+            };
+            let partition = if cumsum > min_resident_size as i128 {
+                MinResidentSizePartition::Above
+            } else {
+                MinResidentSizePartition::Below
+            };
+            candidates.push((partition, candidate));
+            cumsum += i128::from(file_size);
+        }
+    }
+
+    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
+        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
+    candidates
+        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+
+    Ok(EvictionCandidates::Finished(candidates))
+}
+
+/// Map key that identifies a timeline by `Arc` pointer identity; `eq` and `hash` use the same address.
+struct TimelineKey(Arc<Timeline>);
+
+impl PartialEq for TimelineKey {
+    fn eq(&self, rhs: &Self) -> bool {
+        std::ptr::eq(Arc::as_ptr(&self.0), Arc::as_ptr(&rhs.0))
+    }
+}
+
+impl Eq for TimelineKey {}
+
+impl std::hash::Hash for TimelineKey {
+    fn hash<H: std::hash::Hasher>(&self, hasher: &mut H) {
+        std::ptr::hash(Arc::as_ptr(&self.0), hasher);
+    }
+}
+
+impl std::ops::Deref for TimelineKey {
+    type Target = Timeline;
+    fn deref(&self) -> &Timeline {
+        &self.0
+    }
+}
+
+mod filesystem_level_usage {
+    //! Filesystem-level implementation of `super::Usage`, backed by `Statvfs`.
+    //! Used by the background task to decide whether there is disk pressure.
+
+    use std::path::Path;
+
+    use anyhow::Context;
+
+    use crate::statvfs::Statvfs;
+
+    use super::DiskUsageEvictionTaskConfig;
+
+    /// Point-in-time view of the filesystem's space, plus the thresholds to judge it by.
+    /// `add_available_bytes` adjusts the view in memory only.
+    #[derive(Debug, Clone, Copy)]
+    #[allow(dead_code)]
+    pub struct Usage<'a> {
+        /// Thresholds that `has_pressure` compares against.
+        config: &'a DiskUsageEvictionTaskConfig,
+
+        /// Filesystem capacity
+        total_bytes: u64,
+        /// Free filesystem space
+        avail_bytes: u64,
+    }
+
+    impl super::Usage for Usage<'_> {
+        /// There is pressure if fewer than `min_avail_bytes` are available, or if the used
+        /// share of the filesystem exceeds `max_usage_pct`.
+        fn has_pressure(&self) -> bool {
+            let used_fraction = 1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64));
+            // The cast truncates towards zero, so the percentage is effectively rounded down.
+            let usage_pct = (100.0 * used_fraction) as u64;
+
+            let too_little_avail = self.avail_bytes < self.config.min_avail_bytes;
+            let too_much_used = usage_pct > self.config.max_usage_pct.get() as u64;
+            too_little_avail || too_much_used
+        }
+
+        /// Pretend that `bytes` more are available, e.g. because a layer got evicted.
+        fn add_available_bytes(&mut self, bytes: u64) {
+            self.avail_bytes += bytes;
+        }
+    }
+
+    /// Take a `Usage` snapshot of the filesystem that contains `tenants_dir`.
+    ///
+    /// In builds with the `testing` feature, `config.mock_statvfs` is handed to `Statvfs::get`
+    /// as the mock configuration; otherwise `None` is passed.
+    pub fn get<'a>(
+        tenants_dir: &Path,
+        config: &'a DiskUsageEvictionTaskConfig,
+    ) -> anyhow::Result<Usage<'a>> {
+        // Only testing builds have the `mock_statvfs` field to read from.
+        #[cfg(feature = "testing")]
+        let mock_config = config.mock_statvfs.as_ref();
+        #[cfg(not(feature = "testing"))]
+        let mock_config = None;
+
+        let stat = Statvfs::get(tenants_dir, mock_config)
+            .context("statvfs failed, presumably directory got unlinked")?;
+
+        // Prefer the fragment size; fall back to the block size if it is reported as 0.
+        // https://unix.stackexchange.com/a/703650
+        let blocksize = match stat.fragment_size() {
+            0 => stat.block_size(),
+            fragment_size => fragment_size,
+        };
+
+        // use blocks_available (b_avail) since, pageserver runs as unprivileged user
+        let avail_bytes = stat.blocks_available() * blocksize;
+        let total_bytes = stat.blocks() * blocksize;
+
+        Ok(Usage {
+            config,
+            total_bytes,
+            avail_bytes,
+        })
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -27,6 +27,31 @@ paths:
                  id:
                    type: integer

+  /v1/disk_usage_eviction/run:
+    put:
+      description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
+      security: []  # empty list: no security requirement is declared for this operation
+      requestBody:  # NOTE(review): no `required: true` here although the schema requires `evict_bytes` - confirm against the handler
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - evict_bytes
+              properties:
+                evict_bytes:
+                  type: integer  # amount of disk space to evict; presumably in bytes, per the property name
+      responses:
+        "200":
+          description: |
+            The run completed.
+            This does not necessarily mean that we actually evicted `evict_bytes`.
+            Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`.
+          content:
+            application/json:
+              schema:
+                type: object  # NOTE(review): the fields of the returned object are not specified here
+
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -18,6 +18,7 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::disk_usage_eviction_task;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
@@ -48,6 +49,7 @@ struct State {
    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
+    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 }

 impl State {
@@ -55,6 +57,7 @@ impl State {
        conf: &'static PageServerConf,
        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
+        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
    ) -> anyhow::Result<Self> {
        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
            .iter()
@@ -65,6 +68,7 @@ impl State {
            auth,
            allowlist_routes,
            remote_storage,
+            disk_usage_eviction_state,
        })
    }
 }
@@ -775,6 +779,8 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        );
    }

+    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
    let target_tenant_id = request_data
        .new_tenant_id
        .map(TenantId::from)
@@ -906,6 +912,8 @@ async fn update_tenant_config_handler(
        );
    }

+    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
        .instrument(info_span!("tenant_config", tenant = ?tenant_id))
@@ -914,6 +922,20 @@ async fn update_tenant_config_handler(
    json_response(StatusCode::OK, ())
 }

+/// Test-only endpoint that forces the tenant named by the `tenant_id` request parameter
+/// into [`crate::tenant::TenantState::Broken`]; a failed tenant lookup becomes `ApiError::Conflict`.
+#[cfg(feature = "testing")]
+async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
+    let lookup = crate::tenant::mgr::get_tenant(tenant_id, true).await;
+    let Ok(tenant) = lookup else {
+        return Err(ApiError::Conflict(String::from("no active tenant found")));
+    };
+
+    tenant.set_broken("broken from test");
+    json_response(StatusCode::OK, ())
+}
+
 #[cfg(feature = "testing")]
 async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
    if !fail::has_failpoints() {
@@ -1063,6 +1085,114 @@ async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiE
    json_response(StatusCode::NO_CONTENT, ())
 }

+/// Handler for `PUT /v1/disk_usage_eviction/run`: runs one on-demand iteration of
+/// disk-usage-based eviction and responds with the iteration's result as JSON.
+///
+/// The request body is JSON with an `evict_bytes` field. The iteration is considered to be
+/// under disk pressure until at least that many bytes have been reported as freed.
+///
+/// # Errors
+///
+/// * `ApiError::BadRequest` if the request body cannot be parsed.
+/// * `ApiError::InternalServerError` if remote storage is not configured, if the iteration
+///   fails, or if the spawned task goes away without reporting a result.
+async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+
+    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
+    struct Config {
+        /// How many bytes to evict before reporting that pressure is relieved.
+        evict_bytes: u64,
+    }
+
+    #[derive(Debug, Clone, Copy, serde::Serialize)]
+    struct Usage {
+        // remains unchanged after instantiation of the struct
+        config: Config,
+        // updated by `add_available_bytes`
+        freed_bytes: u64,
+    }
+
+    impl crate::disk_usage_eviction_task::Usage for Usage {
+        fn has_pressure(&self) -> bool {
+            self.config.evict_bytes > self.freed_bytes
+        }
+
+        fn add_available_bytes(&mut self, bytes: u64) {
+            // Saturate rather than overflow if the reported total ever exceeds u64::MAX.
+            self.freed_bytes = self.freed_bytes.saturating_add(bytes);
+        }
+    }
+
+    let config = json_request::<Config>(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    let usage = Usage {
+        config,
+        freed_bytes: 0,
+    };
+
+    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
+
+    // The spawned task hands the iteration's result back through this channel.
+    let (tx, rx) = tokio::sync::oneshot::channel();
+
+    let state = get_state(&r);
+
+    let Some(storage) = state.remote_storage.clone() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run eviction iteration"
+        )))
+    };
+
+    let state = state.disk_usage_eviction_state.clone();
+
+    // `_g` cancels the token shared with the task when it is dropped, i.e. whenever this
+    // handler returns or its future is dropped before the result arrives.
+    let cancel = CancellationToken::new();
+    let child_cancel = cancel.clone();
+    let _g = cancel.drop_guard();
+
+    crate::task_mgr::spawn(
+        MGMT_REQUEST_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "ondemand disk usage eviction",
+        false,
+        async move {
+            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+                &state,
+                &storage,
+                usage,
+                &child_cancel,
+            )
+            .await;
+
+            info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+
+            // The receiver may already be gone if the handler was dropped; nothing to do then.
+            let _ = tx.send(res);
+            Ok(())
+        }
+        .in_current_span(),
+    );
+
+    // `rx` fails only if `tx` was dropped without sending, i.e. the task never finished the
+    // iteration. Surface that as an error response rather than panicking in the handler.
+    let response = rx
+        .await
+        .map_err(|recv_err| {
+            ApiError::InternalServerError(anyhow::anyhow!(
+                "disk usage eviction task ended without reporting a result: {recv_err}"
+            ))
+        })?
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1075,6 +1180,7 @@ pub fn make_router(
    launch_ts: &'static LaunchTimestamp,
    auth: Option<Arc<JwtAuth>>,
    remote_storage: Option<GenericRemoteStorage>,
+    disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1119,7 +1225,8 @@ pub fn make_router(

    Ok(router
        .data(Arc::new(
-            State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
+            State::new(conf, auth, remote_storage, disk_usage_eviction_state)
+                .context("Failed to initialize router state")?,
        ))
        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
        .put(
@@ -1200,6 +1307,13 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
        )
+        .put("/v1/disk_usage_eviction/run", |r| {
+            RequestSpan(disk_usage_eviction_run).handle(r)
+        })
+        .put(
+            "/v1/tenant/:tenant_id/break",
+            testing_api!("set tenant state to broken", handle_tenant_break),
+        )
        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -4,6 +4,7 @@ pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
+pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
@@ -12,6 +13,7 @@ pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod repository;
+pub(crate) mod statvfs;
 pub mod task_mgr;
 pub mod tenant;
 pub mod trace;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration {
    }

    pub fn observe(&self, observed_value: Duration) {
-        if self.threshold < observed_value {
+        if observed_value < self.threshold {
            self.counter
                .as_ref()
                .expect("nobody calls this function after `remove_from_vec`")
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -0,0 +1,198 @@
+//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.
+
+use std::path::Path;
+
+/// File system statistics of a directory, coming either from the real `statvfs` call or
+/// from the [`mock`] implementation.
+pub enum Statvfs {
+    Real(nix::sys::statvfs::Statvfs),
+    Mock(mock::Statvfs),
+}
+
+// NB: on macOS, the block count type of struct statvfs is u32.
+// The workaround seems to be to use the non-standard statfs64 call.
+// Since it should only be a problem on > 2TiB disks, let's ignore
+// the problem for now and upcast to u64.
+impl Statvfs {
+    /// Collect statistics for `tenants_dir`.
+    ///
+    /// If `mocked` is given, the result is computed by [`mock::get`] according to that
+    /// behavior; otherwise `nix::sys::statvfs::statvfs` is called on the directory.
+    pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
+        if let Some(mocked) = mocked {
+            Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
+        } else {
+            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
+        }
+    }
+
+    /// Total number of blocks, widened to `u64`.
+    // NB: allow() because the block count type is u32 on macOS.
+    #[allow(clippy::useless_conversion)]
+    pub fn blocks(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
+            Statvfs::Mock(stat) => stat.blocks,
+        }
+    }
+
+    /// Number of available blocks, widened to `u64`.
+    // NB: allow() because the block count type is u32 on macOS.
+    #[allow(clippy::useless_conversion)]
+    pub fn blocks_available(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
+            Statvfs::Mock(stat) => stat.blocks_available,
+        }
+    }
+
+    /// Fragment size; the mock reports its configured `blocksize` here.
+    pub fn fragment_size(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => stat.fragment_size(),
+            Statvfs::Mock(stat) => stat.fragment_size,
+        }
+    }
+
+    /// Block size; the mock reports its configured `blocksize` here.
+    pub fn block_size(&self) -> u64 {
+        match self {
+            Statvfs::Real(stat) => stat.block_size(),
+            Statvfs::Mock(stat) => stat.block_size,
+        }
+    }
+}
+
+pub mod mock {
+    use anyhow::Context;
+    use regex::Regex;
+    use std::path::Path;
+    use tracing::log::info;
+
+    /// What the mocked `statvfs` call should do. Serialized with the variant name in a
+    /// `type` field.
+    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[serde(tag = "type")]
+    pub enum Behavior {
+        /// Report a file system of `total_blocks` blocks of `blocksize` bytes each. The
+        /// space taken up by the files below the queried directory counts as used; with a
+        /// `name_filter`, only files whose name matches it are counted.
+        Success {
+            blocksize: u64,
+            total_blocks: u64,
+            name_filter: Option<utils::serde_regex::Regex>,
+        },
+        /// Fail the call with `mocked_error`.
+        Failure {
+            mocked_error: MockedError,
+        },
+    }
+
+    /// Errors that the mocked `statvfs` call can be configured to fail with.
+    #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+    #[allow(clippy::upper_case_acronyms)]
+    pub enum MockedError {
+        EIO,
+    }
+
+    impl From<MockedError> for nix::Error {
+        fn from(e: MockedError) -> Self {
+            match e {
+                MockedError::EIO => nix::Error::EIO,
+            }
+        }
+    }
+
+    /// Compute mocked statistics for `tenants_dir` as prescribed by `behavior`.
+    ///
+    /// # Panics
+    ///
+    /// With `Behavior::Success`: if `blocksize` is zero, if measuring the disk usage of
+    /// `tenants_dir` fails, or if the measured usage exceeds `total_blocks`.
+    pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
+        info!("running mocked statvfs");
+
+        match behavior {
+            Behavior::Success {
+                blocksize,
+                total_blocks,
+                ref name_filter,
+            } => {
+                // fail with a clear message instead of a division by zero below
+                assert!(*blocksize > 0, "mocking error: blocksize must be non-zero");
+
+                let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap();
+
+                // round it up to the nearest block multiple
+                let used_blocks = (used_bytes + (blocksize - 1)) / blocksize;
+
+                if used_blocks > *total_blocks {
+                    panic!(
+                        "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}"
+                    );
+                }
+
+                let avail_blocks = total_blocks - used_blocks;
+
+                Ok(Statvfs {
+                    blocks: *total_blocks,
+                    blocks_available: avail_blocks,
+                    fragment_size: *blocksize,
+                    block_size: *blocksize,
+                })
+            }
+            Behavior::Failure { mocked_error } => Err((*mocked_error).into()),
+        }
+    }
+
+    /// Sum of the sizes of all regular files below `path` whose file name matches
+    /// `name_filter`; without a filter every regular file is counted.
+    ///
+    /// Entries below `path` that disappear while the walk is in progress are skipped,
+    /// so that e.g. a concurrently deleted file does not fail the measurement.
+    fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
+        let mut total = 0;
+        for entry in walkdir::WalkDir::new(path) {
+            let entry = match entry {
+                Ok(entry) => entry,
+                Err(e) if vanished_during_walk(&e) => continue,
+                Err(e) => return Err(e).context("walk directory"),
+            };
+            if !entry.file_type().is_file() {
+                continue;
+            }
+            // lossy conversion so that a non-UTF-8 file name cannot cause a panic
+            let file_name = entry.file_name().to_string_lossy();
+            if !name_filter.map_or(true, |filter| filter.is_match(&file_name)) {
+                continue;
+            }
+            let metadata = match entry.metadata() {
+                Ok(metadata) => metadata,
+                Err(e) if vanished_during_walk(&e) => continue,
+                Err(e) => {
+                    return Err(e).with_context(|| format!("get metadata of {:?}", entry.path()))
+                }
+            };
+            total += metadata.len();
+        }
+        Ok(total)
+    }
+
+    /// Whether `e` reports that an entry below the walk's root was not found.
+    ///
+    /// A missing root (depth 0) is deliberately not covered, so it is still an error.
+    fn vanished_during_walk(e: &walkdir::Error) -> bool {
+        let not_found = e
+            .io_error()
+            .map_or(false, |io| io.kind() == std::io::ErrorKind::NotFound);
+        e.depth() > 0 && not_found
+    }
+
+    /// Statistics produced by the mock; the fields mirror the accessors of [`super::Statvfs`].
+    pub struct Statvfs {
+        pub blocks: u64,
+        pub blocks_available: u64,
+        pub fragment_size: u64,
+        pub block_size: u64,
+    }
+}
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -234,6 +234,9 @@ pub enum TaskKind {
    // Eviction. One per timeline.
    Eviction,

+    /// See [`crate::disk_usage_eviction_task`].
+    DiskUsageEviction,
+
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -95,7 +95,7 @@ mod timeline;

 pub mod size;

-pub use timeline::{PageReconstructError, Timeline};
+pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};

 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -1706,6 +1706,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

+    /// Tenant-specific `min_resident_size_override`, falling back to the default tenant config.
+    pub fn get_min_resident_size_override(&self) -> Option<u64> {
+        let per_tenant = self.tenant_conf.read().unwrap().min_resident_size_override;
+        let fallback = self.conf.default_tenant_conf.min_resident_size_override;
+        per_tenant.or(fallback)
+    }
+
    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        *self.tenant_conf.write().unwrap() = new_tenant_conf;
    }
@@ -2783,6 +2790,7 @@ pub mod harness {
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
                trace_read_requests: Some(tenant_conf.trace_read_requests),
                eviction_policy: Some(tenant_conf.eviction_policy),
+                min_resident_size_override: tenant_conf.min_resident_size_override,
            }
        }
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -92,6 +92,7 @@ pub struct TenantConf {
    pub max_lsn_wal_lag: NonZeroU64,
    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -159,6 +160,10 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub eviction_policy: Option<EvictionPolicy>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub min_resident_size_override: Option<u64>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -220,6 +225,9 @@ impl TenantConfOpt {
                .trace_read_requests
                .unwrap_or(global_conf.trace_read_requests),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
+            min_resident_size_override: self
+                .min_resident_size_override
+                .or(global_conf.min_resident_size_override),
        }
    }
 }
@@ -251,6 +259,7 @@ impl Default for TenantConf {
                .expect("cannot parse default max walreceiver Lsn wal lag"),
            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -121,10 +121,10 @@ struct LayerAccessStatsInner {
 }

 #[derive(Debug, Clone, Copy)]
-pub(super) struct LayerAccessStatFullDetails {
-    pub(super) when: SystemTime,
-    pub(super) task_kind: TaskKind,
-    pub(super) access_kind: LayerAccessKind,
+pub(crate) struct LayerAccessStatFullDetails {
+    pub(crate) when: SystemTime,
+    pub(crate) task_kind: TaskKind,
+    pub(crate) access_kind: LayerAccessKind,
 }

 #[derive(Clone, Copy, strum_macros::EnumString)]
@@ -255,7 +255,7 @@ impl LayerAccessStats {
        ret
    }

-    pub(super) fn most_recent_access_or_residence_event(
+    fn most_recent_access_or_residence_event(
        &self,
    ) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
        let locked = self.0.lock().unwrap();
@@ -268,6 +268,13 @@ impl LayerAccessStats {
            }
        }
    }
+
+    /// Time of the event returned by `most_recent_access_or_residence_event`:
+    /// `when` of an access, or `timestamp` of a residence event.
+    pub(crate) fn latest_activity(&self) -> SystemTime {
+        self.most_recent_access_or_residence_event()
+            .either(|access| access.when, |residence| residence.timestamp)
+    }
 }

 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -13,6 +13,7 @@ use pageserver_api::models::{
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState,
 };
+use remote_storage::GenericRemoteStorage;
 use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -957,6 +958,25 @@ impl Timeline {
        }
    }

+    /// Evict a batch of layers via `evict_layer_batch`; panics if the timeline has no remote client.
+    ///
+    /// The unused `GenericRemoteStorage` reference is a witness[^witness_article] for "remote storage is configured."
+    ///
+    /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
+    pub async fn evict_layers(
+        &self,
+        _: &GenericRemoteStorage,
+        layers_to_evict: &[Arc<dyn PersistentLayer>],
+        cancel: CancellationToken,
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+        let remote_client = self.remote_client.clone().expect(
+            "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
+        );
+
+        self.evict_layer_batch(&remote_client, layers_to_evict, cancel)
+            .await
+    }
+
    /// Evict multiple layers at once, continuing through errors.
    ///
    /// Try to evict the given `layers_to_evict` by
@@ -994,6 +1014,15 @@ impl Timeline {
        // now lock out layer removal (compaction, gc, timeline deletion)
        let layer_removal_guard = self.layer_removal_cs.lock().await;

+        {
+            // to avoid racing with detach and delete_timeline
+            let state = self.current_state();
+            anyhow::ensure!(
+                state == TimelineState::Active,
+                "timeline is not active but {state:?}"
+            );
+        }
+
        // start the batch update
        let mut layer_map = self.layers.write().unwrap();
        let mut batch_updates = layer_map.batch_update();
@@ -1027,6 +1056,8 @@ impl Timeline {
        use super::layer_map::Replacement;

        if local_layer.is_remote_layer() {
+            // TODO(issue #3851): consider returning an err here instead of false,
+            // which is the same out the match later
            return Ok(false);
        }

@@ -1096,6 +1127,9 @@ impl Timeline {
                    self.metrics
                        .evictions_with_low_residence_duration
                        .observe(delta);
+                    info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
+                } else {
+                    info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
                }

                true
@@ -4012,6 +4046,75 @@ impl Timeline {
    }
 }

+/// Per-timeline input for disk-usage-based eviction, as produced by
+/// `Timeline::get_local_layers_for_disk_usage_eviction`.
+pub struct DiskUsageEvictionInfo {
+    /// Timeline's largest layer (remote or resident)
+    pub max_layer_size: Option<u64>,
+    /// Timeline's resident layers
+    pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
+}
+
+/// A resident layer together with the timestamp of its latest activity.
+pub struct LocalLayerInfoForDiskUsageEviction {
+    pub layer: Arc<dyn PersistentLayer>,
+    pub last_activity_ts: SystemTime,
+}
+
+impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Show the timestamp as RFC 3339 with nanosecond precision so that it is readable.
+        // This allocates a String per call, which is acceptable because formatting is rare.
+        let last_activity = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts)
+            .to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
+
+        let mut out = f.debug_struct("LocalLayerInfoForDiskUsageEviction");
+        out.field("layer", &self.layer);
+        out.field("last_activity", &last_activity);
+        out.finish()
+    }
+}
+
+impl LocalLayerInfoForDiskUsageEviction {
+    /// Size of the layer file, as reported by the layer itself.
+    pub fn file_size(&self) -> u64 {
+        self.layer.file_size()
+    }
+}
+
+impl Timeline {
+    /// Walk the historic layers once and collect what disk-usage-based eviction needs:
+    /// the size of the largest layer seen (resident or not) and the list of resident layers
+    /// with their latest activity timestamps.
+    pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
+        let layer_map = self.layers.read().unwrap();
+
+        let mut info = DiskUsageEvictionInfo {
+            max_layer_size: None,
+            resident_layers: Vec::new(),
+        };
+
+        for layer in layer_map.iter_historic_layers() {
+            // every layer counts towards the maximum, including remote ones
+            let size = layer.file_size();
+            info.max_layer_size = Some(match info.max_layer_size {
+                Some(largest) => largest.max(size),
+                None => size,
+            });
+
+            if !layer.is_remote_layer() {
+                let last_activity_ts = layer.access_stats().latest_activity();
+                info.resident_layers.push(LocalLayerInfoForDiskUsageEviction {
+                    layer,
+                    last_activity_ts,
+                });
+            }
+        }
+
+        info
+    }
+}
+
 type TraversalPathItem = (
    ValueReconstructResult,
    Lsn,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -20,7 +20,6 @@ use std::{
    time::{Duration, SystemTime},
 };

-use either::Either;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn};
@@ -185,13 +184,7 @@ impl Timeline {
                if hist_layer.is_remote_layer() {
                    continue;
                }
-                let last_activity_ts = match hist_layer
-                    .access_stats()
-                    .most_recent_access_or_residence_event()
-                {
-                    Either::Left(mra) => mra.when,
-                    Either::Right(re) => re.timestamp,
-                };
+                let last_activity_ts = hist_layer.access_stats().latest_activity();
                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
                    Err(_e) => {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -237,11 +237,7 @@ async fn connection_manager_loop_step(
        if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
            info!("Switching to new connection candidate: {new_candidate:?}");
            walreceiver_state
-                .change_connection(
-                    new_candidate.safekeeper_id,
-                    new_candidate.wal_source_connconf,
-                    ctx,
-                )
+                .change_connection(new_candidate, ctx)
                .await
        }
    }
@@ -346,6 +342,8 @@ struct WalConnection {
    started_at: NaiveDateTime,
    /// Current safekeeper pageserver is connected to for WAL streaming.
    sk_id: NodeId,
+    /// Availability zone of the safekeeper.
+    availability_zone: Option<String>,
    /// Status of the connection.
    status: WalConnectionStatus,
    /// WAL streaming task handle.
@@ -405,12 +403,7 @@ impl WalreceiverState {
    }

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
-    async fn change_connection(
-        &mut self,
-        new_sk_id: NodeId,
-        new_wal_source_connconf: PgConnectionConfig,
-        ctx: &RequestContext,
-    ) {
+    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
        self.drop_old_connection(true).await;

        let id = self.id;
@@ -424,7 +417,7 @@ impl WalreceiverState {
            async move {
                super::walreceiver_connection::handle_walreceiver_connection(
                    timeline,
-                    new_wal_source_connconf,
+                    new_sk.wal_source_connconf,
                    events_sender,
                    cancellation,
                    connect_timeout,
@@ -433,13 +426,16 @@ impl WalreceiverState {
                .await
                .context("walreceiver connection handling failure")
            }
-            .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
+            .instrument(
+                info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
+            )
        });

        let now = Utc::now().naive_utc();
        self.wal_connection = Some(WalConnection {
            started_at: now,
-            sk_id: new_sk_id,
+            sk_id: new_sk.safekeeper_id,
+            availability_zone: new_sk.availability_zone,
            status: WalConnectionStatus {
                is_connected: false,
                has_processed_wal: false,
@@ -546,6 +542,7 @@ impl WalreceiverState {
    /// * if connected safekeeper is not present, pick the candidate
    /// * if we haven't received any updates for some time, pick the candidate
    /// * if the candidate commit_lsn is much higher than the current one, pick the candidate
+    /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate
    /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
    ///
    /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
@@ -559,6 +556,7 @@ impl WalreceiverState {

                let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
                    self.select_connection_candidate(Some(connected_sk_node))?;
+                let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone();

                let now = Utc::now().naive_utc();
                if let Ok(latest_interaciton) =
@@ -569,6 +567,7 @@ impl WalreceiverState {
                        return Some(NewWalConnectionCandidate {
                            safekeeper_id: new_sk_id,
                            wal_source_connconf: new_wal_source_connconf,
+                            availability_zone: new_availability_zone,
                            reason: ReconnectReason::NoKeepAlives {
                                last_keep_alive: Some(
                                    existing_wal_connection.status.latest_connection_update,
@@ -594,6 +593,7 @@ impl WalreceiverState {
                                return Some(NewWalConnectionCandidate {
                                    safekeeper_id: new_sk_id,
                                    wal_source_connconf: new_wal_source_connconf,
+                                    availability_zone: new_availability_zone,
                                    reason: ReconnectReason::LaggingWal {
                                        current_commit_lsn,
                                        new_commit_lsn,
@@ -601,6 +601,20 @@ impl WalreceiverState {
                                    },
                                });
                            }
+                            // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver,
+                            // and the current one is not, switch to the new one.
+                            if self.availability_zone.is_some()
+                                && existing_wal_connection.availability_zone
+                                    != self.availability_zone
+                                && self.availability_zone == new_availability_zone
+                            {
+                                return Some(NewWalConnectionCandidate {
+                                    safekeeper_id: new_sk_id,
+                                    availability_zone: new_availability_zone,
+                                    wal_source_connconf: new_wal_source_connconf,
+                                    reason: ReconnectReason::SwitchAvailabilityZone,
+                                });
+                            }
                        }
                        None => debug!(
                            "Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
@@ -668,6 +682,7 @@ impl WalreceiverState {
                            return Some(NewWalConnectionCandidate {
                                safekeeper_id: new_sk_id,
                                wal_source_connconf: new_wal_source_connconf,
+                                availability_zone: new_availability_zone,
                                reason: ReconnectReason::NoWalTimeout {
                                    current_lsn,
                                    current_commit_lsn,
@@ -686,10 +701,11 @@ impl WalreceiverState {
                self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
            }
            None => {
-                let (new_sk_id, _, new_wal_source_connconf) =
+                let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
                    self.select_connection_candidate(None)?;
                return Some(NewWalConnectionCandidate {
                    safekeeper_id: new_sk_id,
+                    availability_zone: new_safekeeper_broker_data.availability_zone.clone(),
                    wal_source_connconf: new_wal_source_connconf,
                    reason: ReconnectReason::NoExistingConnection,
                });
@@ -794,6 +810,7 @@ impl WalreceiverState {
 struct NewWalConnectionCandidate {
    safekeeper_id: NodeId,
    wal_source_connconf: PgConnectionConfig,
+    availability_zone: Option<String>,
    // This field is used in `derive(Debug)` only.
    #[allow(dead_code)]
    reason: ReconnectReason,
@@ -808,6 +825,7 @@ enum ReconnectReason {
        new_commit_lsn: Lsn,
        threshold: NonZeroU64,
    },
+    SwitchAvailabilityZone,
    NoWalTimeout {
        current_lsn: Lsn,
        current_commit_lsn: Lsn,
@@ -873,6 +891,7 @@ mod tests {
                peer_horizon_lsn: 0,
                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
+                availability_zone: None,
            },
            latest_update,
        }
@@ -933,6 +952,7 @@ mod tests {
        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: connected_sk_id,
+            availability_zone: None,
            status: connection_status,
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
@@ -1095,6 +1115,7 @@ mod tests {
        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: connected_sk_id,
+            availability_zone: None,
            status: connection_status,
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
@@ -1160,6 +1181,7 @@ mod tests {
        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: NodeId(1),
+            availability_zone: None,
            status: connection_status,
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
@@ -1222,6 +1244,7 @@ mod tests {
        state.wal_connection = Some(WalConnection {
            started_at: now,
            sk_id: NodeId(1),
+            availability_zone: None,
            status: connection_status,
            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
            discovered_new_wal: Some(NewCommittedWAL {
@@ -1289,4 +1312,74 @@ mod tests {
            availability_zone: None,
        }
    }
+
+    #[tokio::test]
+    async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
+        // Pageserver and one of the safekeepers are placed in the same availability zone,
+        // so the pageserver is expected to prefer connecting to that safekeeper.
+        let test_az = Some("test_az".to_owned());
+
+        let harness = TenantHarness::create("switch_to_same_availability_zone")?;
+        let mut state = dummy_state(&harness).await;
+        state.availability_zone = test_az.clone();
+        let current_lsn = Lsn(100_000).align();
+        let now = Utc::now().naive_utc();
+
+        let connected_sk_id = NodeId(0);
+
+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_processed_wal: true,
+            latest_connection_update: now,
+            latest_wal_update: now,
+            commit_lsn: Some(current_lsn),
+            streaming_lsn: Some(current_lsn),
+        };
+
+        state.wal_connection = Some(WalConnection {
+            started_at: now,
+            sk_id: connected_sk_id,
+            availability_zone: None,
+            status: connection_status,
+            connection_task: TaskHandle::spawn(move |sender, _| async move {
+                sender
+                    .send(TaskStateUpdate::Progress(connection_status))
+                    .ok();
+                Ok(())
+            }),
+            discovered_new_wal: None,
+        });
+
+        // We have another safekeeper with the same commit_lsn, and it has the same availability zone as
+        // the current pageserver.
+        let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
+        same_az_sk.timeline.availability_zone = test_az.clone();
+
+        state.wal_stream_candidates = HashMap::from([
+            (
+                connected_sk_id,
+                dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
+            ),
+            (NodeId(1), same_az_sk),
+        ]);
+
+        // We expect that pageserver will switch to the safekeeper in the same availability zone,
+        // even if it has the same commit_lsn.
+        let next_candidate = state.next_connection_candidate().expect(
+            "Expected one candidate selected out of multiple valid data options, but got none",
+        );
+
+        assert_eq!(next_candidate.safekeeper_id, NodeId(1));
+        assert_eq!(
+            next_candidate.reason,
+            ReconnectReason::SwitchAvailabilityZone,
+            "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
+        );
+        assert_eq!(
+            next_candidate.wal_source_connconf.host(),
+            &Host::Domain("same_az".to_owned())
+        );
+
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -37,7 +37,7 @@ use crate::{
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use pq_proto::ReplicationFeedback;
+use pq_proto::PageserverFeedback;
 use utils::lsn::Lsn;

 /// Status of the connection.
@@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection(
                timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));

            // The last LSN we processed. It is not guaranteed to survive pageserver crash.
-            let write_lsn = u64::from(last_lsn);
+            let last_received_lsn = u64::from(last_lsn);
            // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
-            let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
+            let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
            // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
            // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
-            let apply_lsn = u64::from(timeline_remote_consistent_lsn);
+            let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
            let ts = SystemTime::now();

            // Update the status about what we just received. This is shown in the mgmt API.
@@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection(
            let (timeline_logical_size, _) = timeline
                .get_current_logical_size(&ctx)
                .context("Status update creation failed to get current logical size")?;
-            let status_update = ReplicationFeedback {
+            let status_update = PageserverFeedback {
                current_timeline_size: timeline_logical_size,
-                ps_writelsn: write_lsn,
-                ps_flushlsn: flush_lsn,
-                ps_applylsn: apply_lsn,
-                ps_replytime: ts,
+                last_received_lsn,
+                disk_consistent_lsn,
+                remote_consistent_lsn,
+                replytime: ts,
            };

            debug!("neon_status_update {status_update:?}");
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk)
 	return sk->state == SS_ACTIVE;
 }

-/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */
+/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
 void
-ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf)
+ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf)
 {
 	uint8		nkeys;
 	int			i;
@@ -1892,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
 			rf->currentClusterSize = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
 				 rf->currentClusterSize);
 		}
-		else if (strcmp(key, "ps_writelsn") == 0)
+		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_writelsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_writelsn));
+			rf->last_received_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->last_received_lsn));
 		}
-		else if (strcmp(key, "ps_flushlsn") == 0)
+		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_flushlsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_flushlsn));
+			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 		}
-		else if (strcmp(key, "ps_applylsn") == 0)
+		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_applylsn = pq_getmsgint64(reply_message);
-			elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
-				 LSN_FORMAT_ARGS(rf->ps_applylsn));
+			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
+			elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+				 LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 		}
-		else if (strcmp(key, "ps_replytime") == 0)
+		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 		{
 			pq_getmsgint(reply_message, sizeof(int32));
 			/* read value length */
-			rf->ps_replytime = pq_getmsgint64(reply_message);
+			rf->replytime = pq_getmsgint64(reply_message);
 			{
 				char	   *replyTimeStr;

 				/* Copy because timestamptz_to_str returns a static buffer */
-				replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
-				elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
-					 rf->ps_replytime, replyTimeStr);
+				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
+				elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+					 rf->replytime, replyTimeStr);

 				pfree(replyTimeStr);
 			}
@@ -1944,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
 			 * Skip unknown keys to support backward compatibile protocol
 			 * changes
 			 */
-			elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
+			elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 			pq_getmsgbytes(reply_message, len);
 		};
 	}
@@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void)
 }

 /*
- * ReplicationFeedbackShmemSize --- report amount of shared memory space needed
+ * WalproposerShmemSize --- report amount of shared memory space needed
 */
 Size
 WalproposerShmemSize(void)
@@ -2054,10 +2054,10 @@ WalproposerShmemInit(void)
 }

 void
-replication_feedback_set(ReplicationFeedback * rf)
+replication_feedback_set(PageserverFeedback * rf)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
+	memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
 	SpinLockRelease(&walprop_shared->mutex);
 }

@@ -2065,43 +2065,43 @@ void
 replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
 {
 	SpinLockAcquire(&walprop_shared->mutex);
-	*writeLsn = walprop_shared->feedback.ps_writelsn;
-	*flushLsn = walprop_shared->feedback.ps_flushlsn;
-	*applyLsn = walprop_shared->feedback.ps_applylsn;
+	*writeLsn = walprop_shared->feedback.last_received_lsn;
+	*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
+	*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
 	SpinLockRelease(&walprop_shared->mutex);
 }

 /*
- * Get ReplicationFeedback fields from the most advanced safekeeper
+ * Get PageserverFeedback fields from the most advanced safekeeper
 */
 static void
-GetLatestNeonFeedback(ReplicationFeedback * rf)
+GetLatestNeonFeedback(PageserverFeedback * rf)
 {
 	int			latest_safekeeper = 0;
-	XLogRecPtr	ps_writelsn = InvalidXLogRecPtr;
+	XLogRecPtr	last_received_lsn = InvalidXLogRecPtr;

 	for (int i = 0; i < n_safekeepers; i++)
 	{
-		if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn)
+		if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
 		{
 			latest_safekeeper = i;
-			ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn;
+			last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn;
 		}
 	}

 	rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
-	rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn;
-	rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn;
-	rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
-	rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
+	rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
+	rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
+	rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
+	rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime;

 	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
-		 " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
+		 " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
 		 rf->currentClusterSize,
-		 LSN_FORMAT_ARGS(rf->ps_writelsn),
-		 LSN_FORMAT_ARGS(rf->ps_flushlsn),
-		 LSN_FORMAT_ARGS(rf->ps_applylsn),
-		 rf->ps_replytime);
+		 LSN_FORMAT_ARGS(rf->last_received_lsn),
+		 LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
+		 LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
+		 rf->replytime);

 	replication_feedback_set(rf);
 }
@@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void)
 	XLogRecPtr	minFlushLsn;

 	minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
-	diskConsistentLsn = quorumFeedback.rf.ps_flushlsn;
+	diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;

 	if (!syncSafekeepers)
 	{
-		/* Get ReplicationFeedback fields from the most advanced safekeeper */
+		/* Get PageserverFeedback fields from the most advanced safekeeper */
 		GetLatestNeonFeedback(&quorumFeedback.rf);
 		SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
 	}

-	if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
+	if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
 	{

 		if (minQuorumLsn > quorumFeedback.flushLsn)
@@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void)
 			 * apply_lsn - This is what processed and durably saved at*
 			 * pageserver.
 			 */
-								quorumFeedback.rf.ps_flushlsn,
+								quorumFeedback.rf.disk_consistent_lsn,
 								GetCurrentTimestamp(), false);
 	}

@@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
 				msg->hs.xmin.value = pq_getmsgint64_le(&s);
 				msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
 				if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
-					ParseReplicationFeedbackMessage(&s, &msg->rf);
+					ParsePageserverFeedbackMessage(&s, &msg->rf);
 				pq_getmsgend(&s);
 				return true;
 			}
@@ -2462,7 +2462,7 @@ backpressure_lag_impl(void)
 		replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
 #define MB ((XLogRecPtr)1024 * 1024)

-		elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
+		elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
 			 LSN_FORMAT_ARGS(myFlushLsn),
 			 LSN_FORMAT_ARGS(writePtr),
 			 LSN_FORMAT_ARGS(flushPtr),
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback
 	FullTransactionId catalog_xmin;
 }			HotStandbyFeedback;

-typedef struct ReplicationFeedback
+typedef struct PageserverFeedback
 {
 	/* current size of the timeline on pageserver */
 	uint64		currentClusterSize;
 	/* standby_status_update fields that safekeeper received from pageserver */
-	XLogRecPtr	ps_writelsn;
-	XLogRecPtr	ps_flushlsn;
-	XLogRecPtr	ps_applylsn;
-	TimestampTz ps_replytime;
-}			ReplicationFeedback;
+	XLogRecPtr	last_received_lsn;
+	XLogRecPtr	disk_consistent_lsn;
+	XLogRecPtr	remote_consistent_lsn;
+	TimestampTz replytime;
+}			PageserverFeedback;

 typedef struct WalproposerShmemState
 {
 	slock_t		mutex;
-	ReplicationFeedback feedback;
+	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
 }			WalproposerShmemState;
@@ -320,10 +320,10 @@ typedef struct AppendResponse
 	/* Feedback recieved from pageserver includes standby_status_update fields */
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
-	ReplicationFeedback rf;
+	PageserverFeedback rf;
 }			AppendResponse;

-/*  ReplicationFeedback is extensible part of the message that is parsed separately */
+/*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)

@@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]);
 extern void WalProposerMain(Datum main_arg);
 extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
 extern void WalProposerPoll(void);
-extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
-											ReplicationFeedback *rf);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+											PageserverFeedback *rf);
 extern void StartProposerReplication(StartReplicationCmd *cmd);

 extern Size WalproposerShmemSize(void);
 extern bool WalproposerShmemInit(void);
-extern void replication_feedback_set(ReplicationFeedback *rf);
+extern void replication_feedback_set(PageserverFeedback *rf);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

 /* libpqwalproposer hooks & helper type */
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]

 [[package]]
 name = "allure-pytest"
-version = "2.10.0"
+version = "2.13.1"
 description = "Allure pytest integration"
 category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"},
-    {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"},
+    {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"},
+    {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"},
 ]

 [package.dependencies]
-allure-python-commons = "2.10.0"
+allure-python-commons = "2.13.1"
 pytest = ">=4.5.0"
-six = ">=1.9.0"

 [[package]]
 name = "allure-python-commons"
-version = "2.10.0"
+version = "2.13.1"
 description = "Common module for integrate allure with python-based frameworks"
 category = "main"
 optional = false
-python-versions = ">=3.5"
+python-versions = ">=3.6"
 files = [
-    {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"},
-    {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"},
+    {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"},
+    {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"},
 ]

 [package.dependencies]
 attrs = ">=16.0.0"
 pluggy = ">=0.4.0"
-six = ">=1.9.0"

 [[package]]
 name = "async-timeout"
@@ -1932,6 +1930,22 @@ pytest = [
    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]

+[[package]]
+name = "pytest-rerunfailures"
+version = "11.1.2"
+description = "pytest plugin to re-run tests to eliminate flaky failures"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
+    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
+]
+
+[package.dependencies]
+packaging = ">=17.1"
+pytest = ">=5.3"
+
 [[package]]
 name = "pytest-timeout"
 version = "2.1.0"
@@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
+content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1"
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -140,7 +140,7 @@ async fn auth_quirks(

 impl BackendType<'_, ClientCredentials<'_>> {
    /// Authenticate the client via the requested backend, possibly using credentials.
-    #[tracing::instrument(fields(allow_cleartext), skip_all)]
+    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
        &mut self,
        extra: &ConsoleReqExtra<'_>,
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -98,7 +98,7 @@ pub async fn task_main(
 }

 // TODO(tech debt): unite this with its twin below.
-#[tracing::instrument(fields(session_id), skip_all)]
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_ws_client(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
@@ -140,7 +140,7 @@ pub async fn handle_ws_client(
        .await
 }

-#[tracing::instrument(fields(session_id), skip_all)]
+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 async fn handle_client(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
 Werkzeug = "^2.2.3"
 pytest-order = "^1.0.1"
-allure-pytest = "^2.10.0"
+allure-pytest = "^2.13.1"
 pytest-asyncio = "^0.19.0"
 toml = "^0.10.2"
 psutil = "^5.9.4"
@@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4"
 types-toml = "^0.10.8"
 pytest-httpserver = "^1.0.6"
 aiohttp = "3.7.4"
+pytest-rerunfailures = "^11.1.2"

 [tool.poetry.group.dev.dependencies]
 black = "^23.1.0"
@@ -69,6 +70,9 @@ strict = true
 module = [
    "asyncpg.*",
    "pg8000.*",
+    "allure.*",
+    "allure_commons.*",
+    "allure_pytest.*",
 ]
 ignore_missing_imports = true

--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -242,6 +242,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
        safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
        backup_lsn: sk_info.backup_lsn.0,
        local_start_lsn: sk_info.local_start_lsn.0,
+        availability_zone: None,
    };

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -255,7 +255,7 @@ pub struct TimelineCollector {
    epoch_start_lsn: GenericGaugeVec<AtomicU64>,
    peer_horizon_lsn: GenericGaugeVec<AtomicU64>,
    remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
-    feedback_ps_write_lsn: GenericGaugeVec<AtomicU64>,
+    ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
    feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
    timeline_active: GenericGaugeVec<AtomicU64>,
    wal_backup_active: GenericGaugeVec<AtomicU64>,
@@ -339,15 +339,15 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(remote_consistent_lsn.desc().into_iter().cloned());

-        let feedback_ps_write_lsn = GenericGaugeVec::new(
+        let ps_last_received_lsn = GenericGaugeVec::new(
            Opts::new(
-                "safekeeper_feedback_ps_write_lsn",
+                "safekeeper_ps_last_received_lsn",
                "Last LSN received by the pageserver, acknowledged in the feedback",
            ),
            &["tenant_id", "timeline_id"],
        )
        .unwrap();
-        descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned());
+        descs.extend(ps_last_received_lsn.desc().into_iter().cloned());

        let feedback_last_time_seconds = GenericGaugeVec::new(
            Opts::new(
@@ -458,7 +458,7 @@ impl TimelineCollector {
            epoch_start_lsn,
            peer_horizon_lsn,
            remote_consistent_lsn,
-            feedback_ps_write_lsn,
+            ps_last_received_lsn,
            feedback_last_time_seconds,
            timeline_active,
            wal_backup_active,
@@ -489,7 +489,7 @@ impl Collector for TimelineCollector {
        self.epoch_start_lsn.reset();
        self.peer_horizon_lsn.reset();
        self.remote_consistent_lsn.reset();
-        self.feedback_ps_write_lsn.reset();
+        self.ps_last_received_lsn.reset();
        self.feedback_last_time_seconds.reset();
        self.timeline_active.reset();
        self.wal_backup_active.reset();
@@ -514,11 +514,11 @@ impl Collector for TimelineCollector {
            let timeline_id = tli.ttid.timeline_id.to_string();
            let labels = &[tenant_id.as_str(), timeline_id.as_str()];

-            let mut most_advanced: Option<pq_proto::ReplicationFeedback> = None;
+            let mut most_advanced: Option<pq_proto::PageserverFeedback> = None;
            for replica in tli.replicas.iter() {
                if let Some(replica_feedback) = replica.pageserver_feedback {
                    if let Some(current) = most_advanced {
-                        if current.ps_writelsn < replica_feedback.ps_writelsn {
+                        if current.last_received_lsn < replica_feedback.last_received_lsn {
                            most_advanced = Some(replica_feedback);
                        }
                    } else {
@@ -568,11 +568,10 @@ impl Collector for TimelineCollector {
                .set(tli.wal_storage.flush_wal_seconds);

            if let Some(feedback) = most_advanced {
-                self.feedback_ps_write_lsn
+                self.ps_last_received_lsn
                    .with_label_values(labels)
-                    .set(feedback.ps_writelsn);
-                if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH)
-                {
+                    .set(feedback.last_received_lsn);
+                if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) {
                    self.feedback_last_time_seconds
                        .with_label_values(labels)
                        .set(unix_time.as_secs());
@@ -599,7 +598,7 @@ impl Collector for TimelineCollector {
        mfs.extend(self.epoch_start_lsn.collect());
        mfs.extend(self.peer_horizon_lsn.collect());
        mfs.extend(self.remote_consistent_lsn.collect());
-        mfs.extend(self.feedback_ps_write_lsn.collect());
+        mfs.extend(self.ps_last_received_lsn.collect());
        mfs.extend(self.feedback_last_time_seconds.collect());
        mfs.extend(self.timeline_active.collect());
        mfs.extend(self.wal_backup_active.collect());
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -18,7 +18,7 @@ use crate::control_file;
 use crate::send_wal::HotStandbyFeedback;

 use crate::wal_storage;
-use pq_proto::{ReplicationFeedback, SystemId};
+use pq_proto::{PageserverFeedback, SystemId};
 use utils::{
    bin_ser::LeSer,
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -360,7 +360,7 @@ pub struct AppendResponse {
    // a criterion for walproposer --sync mode exit
    pub commit_lsn: Lsn,
    pub hs_feedback: HotStandbyFeedback,
-    pub pageserver_feedback: ReplicationFeedback,
+    pub pageserver_feedback: PageserverFeedback,
 }

 impl AppendResponse {
@@ -370,7 +370,7 @@ impl AppendResponse {
            flush_lsn: Lsn(0),
            commit_lsn: Lsn(0),
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: ReplicationFeedback::empty(),
+            pageserver_feedback: PageserverFeedback::empty(),
        }
    }
 }
@@ -708,7 +708,7 @@ where
            commit_lsn: self.state.commit_lsn,
            // will be filled by the upper code to avoid bothering safekeeper
            hs_feedback: HotStandbyFeedback::empty(),
-            pageserver_feedback: ReplicationFeedback::empty(),
+            pageserver_feedback: PageserverFeedback::empty(),
        };
        trace!("formed AppendResponse {:?}", ar);
        ar
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -11,7 +11,7 @@ use postgres_backend::PostgresBackend;
 use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
 use postgres_ffi::get_current_timestamp;
 use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
-use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
+use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};

@@ -319,11 +319,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                // pageserver sends this.
                // Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
                let buf = Bytes::copy_from_slice(&msg[9..]);
-                let reply = ReplicationFeedback::parse(buf);
+                let reply = PageserverFeedback::parse(buf);

-                trace!("ReplicationFeedback is {:?}", reply);
-                // Only pageserver sends ReplicationFeedback, so set the flag.
-                // This replica is the source of information to resend to compute.
+                trace!("PageserverFeedback is {:?}", reply);
                self.feedback.pageserver_feedback = Some(reply);

                self.tli
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -4,7 +4,7 @@
 use anyhow::{anyhow, bail, Result};
 use parking_lot::{Mutex, MutexGuard};
 use postgres_ffi::XLogSegNo;
-use pq_proto::ReplicationFeedback;
+use pq_proto::PageserverFeedback;
 use serde::Serialize;
 use std::cmp::{max, min};
 use std::path::PathBuf;
@@ -91,7 +91,7 @@ pub struct ReplicaState {
    /// combined hot standby feedback from all replicas
    pub hs_feedback: HotStandbyFeedback,
    /// Replication specific feedback received from pageserver, if any
-    pub pageserver_feedback: Option<ReplicationFeedback>,
+    pub pageserver_feedback: Option<PageserverFeedback>,
 }

 impl Default for ReplicaState {
@@ -276,7 +276,7 @@ impl SharedState {
            //
            if let Some(pageserver_feedback) = state.pageserver_feedback {
                if let Some(acc_feedback) = acc.pageserver_feedback {
-                    if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn {
+                    if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn {
                        warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
                        acc.pageserver_feedback = Some(pageserver_feedback);
                    }
@@ -287,12 +287,12 @@ impl SharedState {
                // last lsn received by pageserver
                // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
                // See https://github.com/neondatabase/neon/issues/1171
-                acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn);
+                acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn);

                // When at least one pageserver has preserved data up to remote_consistent_lsn,
                // safekeeper is free to delete it, so choose max of all pageservers.
                acc.remote_consistent_lsn = max(
-                    Lsn::from(pageserver_feedback.ps_applylsn),
+                    Lsn::from(pageserver_feedback.remote_consistent_lsn),
                    acc.remote_consistent_lsn,
                );
            }
@@ -337,6 +337,7 @@ impl SharedState {
            safekeeper_connstr: conf.listen_pg_addr.clone(),
            backup_lsn: self.sk.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
+            availability_zone: conf.availability_zone.clone(),
        }
    }
 }
@@ -584,7 +585,7 @@ impl Timeline {
            let replica_state = shared_state.replicas[replica_id].unwrap();
            let reported_remote_consistent_lsn = replica_state
                .pageserver_feedback
-                .map(|f| Lsn(f.ps_applylsn))
+                .map(|f| Lsn(f.remote_consistent_lsn))
                .unwrap_or(Lsn::INVALID);
            let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
            (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -0,0 +1,87 @@
+#! /usr/bin/env python3
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+from typing import DefaultDict, Dict
+
+import psycopg2
+import psycopg2.extras
+
+# A test is "flaky" if it was "failed" or "broken" on the main branch in the last N days (N fills %s).
+FLAKY_TESTS_QUERY = """
+    SELECT
+        DISTINCT parent_suite, suite, test
+    FROM
+        (
+            SELECT
+                revision,
+                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
+            FROM
+                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
+        ) data
+    WHERE
+        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        AND status::text IN ('"failed"', '"broken"')
+    ;
+"""
+
+
+def main(args: argparse.Namespace):
+    """Dump flaky tests from the DB into args.output as {parent_suite: {suite: {test: true}}}."""
+    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
+    res = defaultdict(lambda: defaultdict(dict))
+    logging.info("connecting to the database...")
+    conn = psycopg2.connect(args.connstr, connect_timeout=10)
+    try:
+        # `with conn` only wraps a transaction: psycopg2 does not close the connection on exit
+        with conn, conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            logging.info("fetching flaky tests...")
+            cur.execute(FLAKY_TESTS_QUERY, (args.days,))
+            rows = cur.fetchall()
+    finally:
+        conn.close()
+
+    for row in rows:
+        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
+        res[row["parent_suite"]][row["suite"]][row["test"]] = True
+
+    logging.info(f"saving results to {args.output.name}")
+    json.dump(res, args.output, indent=2)
+
+
+if __name__ == "__main__":
+    # Usage: flaky_tests.py [--output FILE] [--days N] connstr
+    # --output is opened for writing by argparse itself (FileType), so main() gets a file object;
+    # --days is the look-back window that main() passes to the query.
+    cli = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
+    cli.add_argument(
+        "--output",
+        default="flaky.json",
+        type=argparse.FileType("w"),
+        help="path to output json file (default: flaky.json)",
+    )
+    cli.add_argument(
+        "--days",
+        type=int,
+        default=10,
+        required=False,
+        help="how many days to look back for flaky tests (default: 10)",
+    )
+    cli.add_argument(
+        "connstr",
+        help="connection string to the test results database",
+    )
+    parsed = cli.parse_args()
+
+    # Bare "%(message)s" format: log lines carry no level or timestamp prefix
+    logging.basicConfig(format="%(message)s", level=logging.INFO)
+
+    main(parsed)
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -0,0 +1,125 @@
+//
+// The script parses Allure reports and posts a comment with a summary of the test results to the PR.
+// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
+//
+// The comment is updated on each run with the latest results.
+//
+// It is designed to be used with actions/github-script from GitHub Workflows:
+// - uses: actions/github-script@v6
+//   with:
+//     script: |
+//       const script = require("./scripts/pr-comment-test-report.js")
+//       await script({
+//         github,
+//         context,
+//         fetch,
+//         reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
+//       })
+//
+
+// Builds a markdown summary for every Allure report in `reports` and posts it as a PR comment.
+// A comment created by an earlier run (found via `startMarker`) is updated in place instead.
+// Reports without "reportUrl" or "jsonUrl" are skipped with a warning.
+module.exports = async ({ github, context, fetch, reports }) => {
+    // Marker to find the comment in the subsequent runs
+    const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
+    const githubActionsBotId = 41898282
+    // The latest commit in the PR URL
+    const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
+    // Comment body itself
+    let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
+
+    // Common parameters for GitHub API requests
+    const ownerRepoParams = { owner: context.repo.owner, repo: context.repo.repo }
+
+    for (const report of reports) {
+        const {buildType, reportUrl, jsonUrl} = report
+
+        if (!reportUrl || !jsonUrl) {
+            console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`)
+            continue
+        }
+
+        const suites = await (await fetch(jsonUrl)).json()
+
+        // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
+        // For this report it's ok to treat them in the same way (as failed).
+        const failedTests = []
+        const passedTests = []
+        const skippedTests = []
+
+        // Tests whose status differed between retries; listed as "Flaky tests" below
+        const retriedStatusChangedTests = []
+
+        for (const parentSuite of suites.children) {
+            for (const suite of parentSuite.children) {
+                for (const test of suite.children) {
+                    // Regex with /g: a string pattern would replace only the first "."
+                    const pytestName = `${parentSuite.name.replace(/\./g, "/")}/${suite.name}.py::${test.name}`
+                    test.pytestName = pytestName
+
+                    if (test.status === "passed") {
+                        passedTests.push(test);
+                    } else if (test.status === "failed" || test.status === "broken") {
+                        failedTests.push(test);
+                    } else if (test.status === "skipped") {
+                        skippedTests.push(test);
+                    }
+
+                    // A retried test whose attempts ended with different statuses is flaky.
+                    // Allure reports that per test as `retriesStatusChange` (next to `retriesCount`);
+                    // there is no `retriedStatusChangedTests` property on a test.
+                    if (test.retriesCount > 0 && test.retriesStatusChange) {
+                        retriedStatusChangedTests.push(test);
+                    }
+                }
+            }
+        }
+
+        const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
+        commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
+        if (failedTests.length > 0) {
+            commentBody += `Failed tests:\n`
+            for (const test of failedTests) {
+                const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+
+                commentBody += `- [\`${test.pytestName}\`](${allureLink})`
+                if (test.retriesCount > 0) {
+                    commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
+                }
+                commentBody += "\n"
+            }
+            commentBody += "\n"
+        }
+        if (retriedStatusChangedTests.length > 0) {
+            commentBody += `Flaky tests:\n`
+            for (const test of retriedStatusChangedTests) {
+                const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
+                commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
+            }
+            commentBody += "\n"
+        }
+        commentBody += "___\n"
+    }
+
+    // Paginate: a plain listComments call returns only the first page of comments
+    const comments = await github.paginate(github.rest.issues.listComments, {
+        issue_number: context.payload.number,
+        ...ownerRepoParams,
+    })
+    const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker))
+    if (comment) {
+        await github.rest.issues.updateComment({
+            comment_id: comment.id,
+            body: commentBody,
+            ...ownerRepoParams,
+        })
+    } else {
+        await github.rest.issues.createComment({
+            issue_number: context.payload.number,
+            body: commentBody,
+            ...ownerRepoParams,
+        })
+    }
+}
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -133,6 +133,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                peer_horizon_lsn: 5,
                safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
                local_start_lsn: 0,
+                availability_zone: None,
            };
            counter += 1;
            yield info;
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -36,9 +36,11 @@ message SafekeeperTimelineInfo {
    uint64 local_start_lsn = 9;
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
+    // Availability zone of a safekeeper.
+    optional string availability_zone = 11;
 }

 message TenantTimelineId {
    bytes tenant_id = 1;
    bytes timeline_id = 2;
-}
+}
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -525,6 +525,7 @@ mod tests {
            peer_horizon_lsn: 5,
            safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
            local_start_lsn: 0,
+            availability_zone: None,
        }
    }

--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -4,4 +4,5 @@ pytest_plugins = (
    "fixtures.pg_stats",
    "fixtures.compare_fixtures",
    "fixtures.slow",
+    "fixtures.flaky",
 )
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -114,7 +114,7 @@ class NeonCompare(PgCompare):
        self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant)

        # Start pg
-        self._pg = self.env.postgres.create_start(branch_name, "main", self.tenant)
+        self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant)

    @property
    def pg(self) -> PgProtocol:
--- a/test_runner/fixtures/flaky.py
+++ b/test_runner/fixtures/flaky.py
@@ -0,0 +1,58 @@
+import json
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.config import Config
+from _pytest.config.argparsing import Parser
+from allure_commons.types import LabelType
+from allure_pytest.utils import allure_name, allure_suite_labels
+
+from fixtures.log_helper import log
+
+"""
+The plugin reruns flaky tests.
+It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
+
+Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
+"""
+
+
+def pytest_addoption(parser: Parser):
+    """Register the `--flaky-tests-json` command-line option.
+
+    The value is converted to a `Path`; no default is set for the option.
+    """
+    option_help = "Path to json file with flaky tests generated by scripts/flaky_tests.py"
+    parser.addoption("--flaky-tests-json", action="store", type=Path, help=option_help)
+
+
+def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
+    """Add `pytest.mark.flaky(reruns=2)` to collected tests listed in the --flaky-tests-json file."""
+    flaky_json = config.getoption("--flaky-tests-json")
+    if not flaky_json or not flaky_json.exists():
+        return
+
+    # Problems with loading the flaky list are not critical: log them and do not rerun any tests
+    try:
+        flaky_tests = json.loads(flaky_json.read_text())
+    except (OSError, ValueError) as e:
+        log.error(f"Can't load flaky tests from {flaky_json}: {e}")
+        return
+    if not isinstance(flaky_tests, dict):
+        log.error(f"Can't use flaky tests from {flaky_json}: expected a json object")
+        return
+
+    for item in items:
+        # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
+        # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
+        allure_labels = dict(allure_suite_labels(item))
+        parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
+        suite = str(allure_labels.get(LabelType.SUITE))
+        params = item.callspec.params if hasattr(item, "callspec") else {}
+        name = allure_name(item, params)
+
+        if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
+            # Rerun 3 times = 1 original run + 2 reruns
+            log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
+            item.add_marker(pytest.mark.flaky(reruns=2))
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -598,6 +598,7 @@ class NeonEnvBuilder:
        rust_log_override: Optional[str] = None,
        default_branch_name: str = DEFAULT_BRANCH_NAME,
        preserve_database_files: bool = False,
+        initial_tenant: Optional[TenantId] = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
@@ -613,32 +614,39 @@ class NeonEnvBuilder:
        self.safekeepers_enable_fsync = safekeepers_enable_fsync
        self.auth_enabled = auth_enabled
        self.default_branch_name = default_branch_name
-        self.env: Optional[NeonEnvWithoutInitialTenant] = None
+        self.env: Optional[NeonEnv] = None
        self.remote_storage_prefix: Optional[str] = None
        self.keep_remote_storage_contents: bool = True
        self.neon_binpath = neon_binpath
        self.pg_distrib_dir = pg_distrib_dir
        self.pg_version = pg_version
        self.preserve_database_files = preserve_database_files
+        self.initial_tenant = initial_tenant or TenantId.generate()

-    def init_configs(self) -> NeonEnvWithoutInitialTenant:
+    def init_configs(self) -> NeonEnv:
        # Cannot create more than one environment from one builder
        assert self.env is None, "environment already initialized"
-        self.env = NeonEnvWithoutInitialTenant(self)
+        self.env = NeonEnv(self)
        return self.env

    def start(self):
        assert self.env is not None, "environment is not already initialized, call init() first"
        self.env.start()

-    def init_start_no_initial_tenant(self) -> NeonEnvWithoutInitialTenant:
-        self.env = self.init_configs()
-        self.start()
-        return self.env
-
    def init_start(self) -> NeonEnv:
-        env_without_initial_tenant = self.init_start_no_initial_tenant()
-        return NeonEnv(env_without_initial_tenant)
+        env = self.init_configs()
+        self.start()
+
+        # Prepare the default branch to start the postgres on later.
+        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
+        log.info(
+            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
+        )
+        initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant)
+        env.initial_timeline = initial_timeline
+        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
+
+        return env

    def enable_remote_storage(
        self,
@@ -821,7 +829,7 @@ class NeonEnvBuilder:
        # Stop all the nodes.
        if self.env:
            log.info("Cleaning up all storage and compute nodes")
-            self.env.postgres.stop_all()
+            self.env.endpoints.stop_all()
            for sk in self.env.safekeepers:
                sk.stop(immediate=True)
            self.env.pageserver.stop(immediate=True)
@@ -846,7 +854,7 @@ class NeonEnvBuilder:
            self.env.pageserver.assert_no_errors()


-class NeonEnvWithoutInitialTenant:
+class NeonEnv:
    """
    An object representing the Neon runtime environment. It consists of
    the page server, 0-N safekeepers, and the compute nodes.
@@ -885,7 +893,7 @@ class NeonEnvWithoutInitialTenant:
        self.port_distributor = config.port_distributor
        self.s3_mock_server = config.mock_s3_server
        self.neon_cli = NeonCli(env=self)
-        self.postgres = PostgresFactory(self)
+        self.endpoints = EndpointFactory(self)
        self.safekeepers: List[Safekeeper] = []
        self.broker = config.broker
        self.remote_storage = config.remote_storage
@@ -893,14 +901,21 @@ class NeonEnvWithoutInitialTenant:
        self.pg_version = config.pg_version
        self.neon_binpath = config.neon_binpath
        self.pg_distrib_dir = config.pg_distrib_dir
+        self.endpoint_counter = 0

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
-        # self.initial_tenant: TenantId = TenantId.generate()
-        # self.initial_timeline: Optional[TimelineId] = None
+        self.initial_tenant = config.initial_tenant
+        self.initial_timeline: Optional[TimelineId] = None

        # Create a config file corresponding to the options
        toml = textwrap.dedent(
+            f"""
+            default_tenant_id = '{config.initial_tenant}'
+        """
+        )
+
+        toml += textwrap.dedent(
            f"""
            [broker]
            listen_addr = '{self.broker.listen_addr()}'
@@ -1000,31 +1015,12 @@ class NeonEnvWithoutInitialTenant:
        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
        return AuthKeys(pub=pub, priv=priv)

-
-class NeonEnv(NeonEnvWithoutInitialTenant):
-    """Wrapper class around NeonEnvWithoutInitialTenant that provides a default tenant & timeline"""
-
-    initial_tenant: TenantId
-    initial_timeline: TimelineId
-
-    def __init__(self, baseObject: NeonEnvWithoutInitialTenant):
-        # https://stackoverflow.com/a/1445289
-        self.__class__ = type(
-            baseObject.__class__.__name__, (self.__class__, baseObject.__class__), {}
-        )
-        self.__dict__ = baseObject.__dict__
-
-        # Prepare the default branch to start the postgres on later.
-        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
-        initial_tenant = TenantId.generate()
-        log.info(f"Creating initial tenant {initial_tenant} and its initial timeline")
-        initial_tenant2, initial_timeline = self.neon_cli.create_tenant(
-            tenant_id=initial_tenant, set_default=True
-        )
-        assert initial_tenant == initial_tenant2
-        self.initial_tenant = initial_tenant
-        self.initial_timeline = initial_timeline
-        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")
+    def generate_endpoint_id(self) -> str:
+        """
+        Bump this environment's endpoint counter and return the next ID ("ep-1", "ep-2", ...)
+        """
+        self.endpoint_counter = self.endpoint_counter + 1
+        return f"ep-{self.endpoint_counter}"


@pytest.fixture(scope=shareable_scope)
@@ -1084,7 +1080,7 @@ def neon_simple_env(_shared_simple_env: NeonEnv) -> Iterator[NeonEnv]:
    """
    yield _shared_simple_env

-    _shared_simple_env.postgres.stop_all()
+    _shared_simple_env.endpoints.stop_all()


@pytest.fixture(scope="function")
@@ -1108,7 +1104,7 @@ def neon_env_builder(
    neon_env_builder.init_start().

    After the initialization, you can launch compute nodes by calling
-    the functions in the 'env.postgres' factory object, stop/start the
+    the functions in the 'env.endpoints' factory object, stop/start the
    nodes, etc.
    """

@@ -1232,6 +1228,28 @@ class PageserverHttpClient(requests.Session):
        self.verbose_error(res)
        return TenantConfig.from_json(res.json())

+    def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
+        """PUT `config`, plus the stringified `tenant_id`, to /v1/tenant/config."""
+        # The tenant id is added here, so the caller's dict must not already carry one.
+        assert "tenant_id" not in config
+        url = f"http://localhost:{self.port}/v1/tenant/config"
+        response = self.put(url, json={**config, "tenant_id": str(tenant_id)})
+        self.verbose_error(response)
+
+    def patch_tenant_config_client_side(
+        self,
+        tenant_id: TenantId,
+        inserts: Optional[Dict[str, Any]] = None,
+        removes: Optional[List[str]] = None,
+    ):
+        """Fetch the tenant's overrides, apply `inserts`, delete `removes`, then send them back."""
+        overrides = self.tenant_config(tenant_id).tenant_specific_overrides
+        overrides.update(inserts or {})
+        # Plain `del`: a key in `removes` that is not currently overridden raises KeyError.
+        for key in removes or []:
+            del overrides[key]
+        self.set_tenant_config(tenant_id, overrides)
+
    def tenant_size(self, tenant_id: TenantId) -> int:
        return self.tenant_size_and_modelinputs(tenant_id)[0]

@@ -1548,6 +1566,18 @@ class PageserverHttpClient(requests.Session):
        for layer in info.historic_layers:
            self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)

+    def disk_usage_eviction_run(self, request: dict[str, Any]):
+        """PUT `request` to /v1/disk_usage_eviction/run and return the decoded JSON reply."""
+        # verbose_error (defined elsewhere in this class) runs before the body is decoded.
+        url = f"http://localhost:{self.port}/v1/disk_usage_eviction/run"
+        response = self.put(url, json=request)
+        self.verbose_error(response)
+        return response.json()
+
+    def tenant_break(self, tenant_id: TenantId):
+        # PUT with no body to /v1/tenant/<tenant_id>/break, then pass the response to verbose_error.
+        self.verbose_error(self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break"))
+

@dataclass
 class TenantConfig:
@@ -1649,7 +1679,7 @@ class AbstractNeonCli(abc.ABC):
    Do not use directly, use specific subclasses instead.
    """

-    def __init__(self, env: NeonEnvWithoutInitialTenant):
+    def __init__(self, env: NeonEnv):
        self.env = env

    COMMAND: str = cast(str, None)  # To be overwritten by the derived class.
@@ -1808,7 +1838,8 @@ class NeonCli(AbstractNeonCli):
            "create",
            "--branch-name",
            new_branch_name,
-            *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
+            "--tenant-id",
+            str(tenant_id or self.env.initial_tenant),
            "--pg-version",
            self.env.pg_version,
        ]
@@ -1836,7 +1867,8 @@ class NeonCli(AbstractNeonCli):
            "branch",
            "--branch-name",
            new_branch_name,
-            *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
+            "--tenant-id",
+            str(tenant_id or self.env.initial_tenant),
        ]
        if ancestor_branch_name is not None:
            cmd.extend(["--ancestor-branch-name", ancestor_branch_name])
@@ -1865,11 +1897,7 @@ class NeonCli(AbstractNeonCli):
        # main [b49f7954224a0ad25cc0013ea107b54b]
        # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
        res = self.raw_cli(
-            [
-                "timeline",
-                "list",
-                *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
-            ]
+            ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)]
        )
        timelines_cli = sorted(
            map(
@@ -1949,18 +1977,19 @@ class NeonCli(AbstractNeonCli):
            args.extend(["-m", "immediate"])
        return self.raw_cli(args)

-    def pg_create(
+    def endpoint_create(
        self,
        branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        port: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
-            "pg",
+            "endpoint",
            "create",
-            *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
+            "--tenant-id",
+            str(tenant_id or self.env.initial_tenant),
            "--branch-name",
            branch_name,
            "--pg-version",
@@ -1970,24 +1999,25 @@ class NeonCli(AbstractNeonCli):
            args.extend(["--lsn", str(lsn)])
        if port is not None:
            args.extend(["--port", str(port)])
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)

        res = self.raw_cli(args)
        res.check_returncode()
        return res

-    def pg_start(
+    def endpoint_start(
        self,
-        node_name: str,
+        endpoint_id: str,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        port: Optional[int] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
-            "pg",
+            "endpoint",
            "start",
-            *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
+            "--tenant-id",
+            str(tenant_id or self.env.initial_tenant),
            "--pg-version",
            self.env.pg_version,
        ]
@@ -1995,29 +2025,30 @@ class NeonCli(AbstractNeonCli):
            args.append(f"--lsn={lsn}")
        if port is not None:
            args.append(f"--port={port}")
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)

        res = self.raw_cli(args)
        res.check_returncode()
        return res

-    def pg_stop(
+    def endpoint_stop(
        self,
-        node_name: str,
+        endpoint_id: str,
        tenant_id: Optional[TenantId] = None,
        destroy=False,
        check_return_code=True,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
-            "pg",
+            "endpoint",
            "stop",
-            *(["--tenant-id", str(tenant_id)] if tenant_id is not None else []),
+            "--tenant-id",
+            str(tenant_id or self.env.initial_tenant),
        ]
        if destroy:
            args.append("--destroy")
-        if node_name is not None:
-            args.append(node_name)
+        if endpoint_id is not None:
+            args.append(endpoint_id)

        return self.raw_cli(args, check_return_code=check_return_code)

@@ -2061,12 +2092,7 @@ class NeonPageserver(PgProtocol):

    TEMP_FILE_SUFFIX = "___temp"

-    def __init__(
-        self,
-        env: NeonEnvWithoutInitialTenant,
-        port: PageserverPort,
-        config_override: Optional[str] = None,
-    ):
+    def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional[str] = None):
        super().__init__(host="localhost", port=port.pg, user="cloud_admin")
        self.env = env
        self.running = False
@@ -2674,48 +2700,50 @@ def static_proxy(
        yield proxy


-class Postgres(PgProtocol):
-    """An object representing a running postgres daemon."""
+class Endpoint(PgProtocol):
+    """An object representing a Postgres compute endpoint managed by the control plane."""

    def __init__(
-        self,
-        env: NeonEnvWithoutInitialTenant,
-        tenant_id: Optional[TenantId],
-        port: int,
-        check_stop_result: bool = True,
+        self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True
    ):
        super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres")
        self.env = env
        self.running = False
-        self.node_name: Optional[str] = None  # dubious, see asserts below
-        self.pgdata_dir: Optional[Path] = None  # Path to computenode PGDATA
+        self.endpoint_id: Optional[str] = None  # dubious, see asserts below
+        self.pgdata_dir: Optional[str] = None  # Path to computenode PGDATA
        self.tenant_id = tenant_id
        self.port = port
        self.check_stop_result = check_stop_result
-        # path to conf is <repo_dir>/pgdatadirs/tenants/<tenant_id>/<node_name>/postgresql.conf
+        # path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf

    def create(
        self,
        branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-    ) -> "Postgres":
+    ) -> "Endpoint":
        """
-        Create the pg data directory.
+        Create a new Postgres endpoint.
        Returns self.
        """

        if not config_lines:
            config_lines = []

-        self.node_name = node_name or f"{branch_name}_pg_node"
-        output = self.env.neon_cli.pg_create(
-            branch_name, node_name=self.node_name, tenant_id=self.tenant_id, lsn=lsn, port=self.port
+        if endpoint_id is None:
+            endpoint_id = self.env.generate_endpoint_id()
+        self.endpoint_id = endpoint_id
+
+        self.env.neon_cli.endpoint_create(
+            branch_name,
+            endpoint_id=self.endpoint_id,
+            tenant_id=self.tenant_id,
+            lsn=lsn,
+            port=self.port,
        )
-        self.pgdata_dir = Path(output.stdout.strip())
-        assert self.pgdata_dir.is_dir()
-        assert Path(self.config_file_path()).is_file()
+        path = Path("endpoints") / self.endpoint_id / "pgdata"
+        self.pgdata_dir = os.path.join(self.env.repo_dir, path)

        if config_lines is None:
            config_lines = []
@@ -2727,24 +2755,30 @@ class Postgres(PgProtocol):

        return self

-    def start(self) -> "Postgres":
+    def start(self) -> "Endpoint":
        """
        Start the Postgres instance.
        Returns self.
        """

-        assert self.node_name is not None
+        assert self.endpoint_id is not None

-        log.info(f"Starting postgres node {self.node_name}")
+        log.info(f"Starting postgres endpoint {self.endpoint_id}")

-        self.env.neon_cli.pg_start(self.node_name, tenant_id=self.tenant_id, port=self.port)
+        self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port)
        self.running = True

        return self

+    def endpoint_path(self) -> Path:
+        """Directory of this endpoint: <repo_dir>/endpoints/<endpoint_id>"""
+        # Needs an assigned endpoint ID: the assert trips on None or an empty string.
+        assert self.endpoint_id
+        return self.env.repo_dir / (Path("endpoints") / self.endpoint_id)
+
    def pg_data_dir_path(self) -> str:
-        """Path to data directory"""
-        return str(self.pgdata_dir)
+        """Path to Postgres data directory"""
+        return os.path.join(self.endpoint_path(), "pgdata")

    def pg_xact_dir_path(self) -> str:
        """Path to pg_xact dir"""
@@ -2758,7 +2792,7 @@ class Postgres(PgProtocol):
        """Path to postgresql.conf"""
        return os.path.join(self.pg_data_dir_path(), "postgresql.conf")

-    def adjust_for_safekeepers(self, safekeepers: str) -> "Postgres":
+    def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint":
        """
        Adjust instance config for working with wal acceptors instead of
        pageserver (pre-configured by CLI) directly.
@@ -2782,7 +2816,7 @@ class Postgres(PgProtocol):
            f.write("neon.safekeepers = '{}'\n".format(safekeepers))
        return self

-    def config(self, lines: List[str]) -> "Postgres":
+    def config(self, lines: List[str]) -> "Endpoint":
        """
        Add lines to postgresql.conf.
        Lines should be an array of valid postgresql.conf rows.
@@ -2796,32 +2830,32 @@ class Postgres(PgProtocol):

        return self

-    def stop(self) -> "Postgres":
+    def stop(self) -> "Endpoint":
        """
        Stop the Postgres instance if it's running.
        Returns self.
        """

        if self.running:
-            assert self.node_name is not None
-            self.env.neon_cli.pg_stop(
-                self.node_name, self.tenant_id, check_return_code=self.check_stop_result
+            assert self.endpoint_id is not None
+            self.env.neon_cli.endpoint_stop(
+                self.endpoint_id, self.tenant_id, check_return_code=self.check_stop_result
            )
            self.running = False

        return self

-    def stop_and_destroy(self) -> "Postgres":
+    def stop_and_destroy(self) -> "Endpoint":
        """
-        Stop the Postgres instance, then destroy it.
+        Stop the Postgres instance, then destroy the endpoint.
        Returns self.
        """

-        assert self.node_name is not None
-        self.env.neon_cli.pg_stop(
-            self.node_name, self.tenant_id, True, check_return_code=self.check_stop_result
+        assert self.endpoint_id is not None
+        self.env.neon_cli.endpoint_stop(
+            self.endpoint_id, self.tenant_id, True, check_return_code=self.check_stop_result
        )
-        self.node_name = None
+        self.endpoint_id = None
        self.running = False

        return self
@@ -2829,13 +2863,12 @@ class Postgres(PgProtocol):
    def create_start(
        self,
        branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-    ) -> "Postgres":
+    ) -> "Endpoint":
        """
-        Create a Postgres instance, apply config
-        and then start it.
+        Create an endpoint, apply config, and start Postgres.
        Returns self.
        """

@@ -2843,7 +2876,7 @@ class Postgres(PgProtocol):

        self.create(
            branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
            config_lines=config_lines,
            lsn=lsn,
        ).start()
@@ -2852,7 +2885,7 @@ class Postgres(PgProtocol):

        return self

-    def __enter__(self) -> "Postgres":
+    def __enter__(self) -> "Endpoint":
        return self

    def __exit__(
@@ -2864,33 +2897,33 @@ class Postgres(PgProtocol):
        self.stop()


-class PostgresFactory:
-    """An object representing multiple running postgres daemons."""
+class EndpointFactory:
+    """An object representing multiple compute endpoints."""

-    def __init__(self, env: NeonEnvWithoutInitialTenant):
+    def __init__(self, env: NeonEnv):
        self.env = env
        self.num_instances: int = 0
-        self.instances: List[Postgres] = []
+        self.endpoints: List[Endpoint] = []

    def create_start(
        self,
        branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-    ) -> Postgres:
-        pg = Postgres(
+    ) -> Endpoint:
+        ep = Endpoint(
            self.env,
-            tenant_id=tenant_id,
+            tenant_id=tenant_id or self.env.initial_tenant,
            port=self.env.port_distributor.get_port(),
        )
        self.num_instances += 1
-        self.instances.append(pg)
+        self.endpoints.append(ep)

-        return pg.create_start(
+        return ep.create_start(
            branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
            config_lines=config_lines,
            lsn=lsn,
        )
@@ -2898,30 +2931,33 @@ class PostgresFactory:
    def create(
        self,
        branch_name: str,
-        node_name: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-    ) -> Postgres:
-        pg = Postgres(
+    ) -> Endpoint:
+        ep = Endpoint(
            self.env,
-            tenant_id=tenant_id,
+            tenant_id=tenant_id or self.env.initial_tenant,
            port=self.env.port_distributor.get_port(),
        )

-        self.num_instances += 1
-        self.instances.append(pg)
+        if endpoint_id is None:
+            endpoint_id = self.env.generate_endpoint_id()

-        return pg.create(
+        self.num_instances += 1
+        self.endpoints.append(ep)
+
+        return ep.create(
            branch_name=branch_name,
-            node_name=node_name,
+            endpoint_id=endpoint_id,
            lsn=lsn,
            config_lines=config_lines,
        )

-    def stop_all(self) -> "PostgresFactory":
-        for pg in self.instances:
-            pg.stop()
+    def stop_all(self) -> "EndpointFactory":
+        for ep in self.endpoints:
+            ep.stop()

        return self

@@ -2936,7 +2972,7 @@ class SafekeeperPort:
 class Safekeeper:
    """An object representing a running safekeeper daemon."""

-    env: NeonEnvWithoutInitialTenant
+    env: NeonEnv
    port: SafekeeperPort
    id: int
    running: bool = False
@@ -3296,17 +3332,16 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:
 def check_restored_datadir_content(
    test_output_dir: Path,
    env: NeonEnv,
-    pg: Postgres,
+    endpoint: Endpoint,
 ):
    # Get the timeline ID. We need it for the 'basebackup' command
-    tenant = TenantId(pg.safe_psql("SHOW neon.tenant_id")[0][0])
-    timeline = TimelineId(pg.safe_psql("SHOW neon.timeline_id")[0][0])
+    timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

    # stop postgres to ensure that files won't change
-    pg.stop()
+    endpoint.stop()

    # Take a basebackup from pageserver
-    restored_dir_path = env.repo_dir / f"{pg.node_name}_restored_datadir"
+    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
    restored_dir_path.mkdir(exist_ok=True)

    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
@@ -3316,7 +3351,7 @@ def check_restored_datadir_content(
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.pageserver.service_port.pg}  \
-            -c 'basebackup {tenant} {timeline}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline}'  \
         | tar -x -C {restored_dir_path}
    """

@@ -3333,8 +3368,8 @@ def check_restored_datadir_content(
    assert result.returncode == 0

    # list files we're going to compare
-    assert pg.pgdata_dir
-    pgdata_files = list_files_to_compare(Path(pg.pgdata_dir))
+    assert endpoint.pgdata_dir
+    pgdata_files = list_files_to_compare(Path(endpoint.pgdata_dir))
    restored_files = list_files_to_compare(restored_dir_path)

    # check that file sets are equal
@@ -3345,12 +3380,12 @@ def check_restored_datadir_content(
    # We've already filtered all mismatching files in list_files_to_compare(),
    # so here expect that the content is identical
    (match, mismatch, error) = filecmp.cmpfiles(
-        pg.pgdata_dir, restored_dir_path, pgdata_files, shallow=False
+        endpoint.pgdata_dir, restored_dir_path, pgdata_files, shallow=False
    )
    log.info(f"filecmp result mismatch and error lists:\n\t mismatch={mismatch}\n\t error={error}")

    for f in mismatch:
-        f1 = os.path.join(pg.pgdata_dir, f)
+        f1 = os.path.join(endpoint.pgdata_dir, f)
        f2 = os.path.join(restored_dir_path, f)
        stdout_filename = "{}.filediff".format(f2)

@@ -3510,24 +3545,24 @@ def wait_for_last_record_lsn(


 def wait_for_last_flush_lsn(
-    env: NeonEnvWithoutInitialTenant, pg: Postgres, tenant: TenantId, timeline: TimelineId
+    env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
    """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
-    last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
    return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)


 def wait_for_wal_insert_lsn(
-    env: NeonEnvWithoutInitialTenant, pg: Postgres, tenant: TenantId, timeline: TimelineId
+    env: NeonEnv, endpoint: Endpoint, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
    """Wait for pageserver to catch up the latest insert LSN, returns the last observed lsn."""
-    last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
+    last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
    return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)


 def fork_at_current_lsn(
-    env: NeonEnvWithoutInitialTenant,
-    pg: Postgres,
+    env: NeonEnv,
+    endpoint: Endpoint,
    new_branch_name: str,
    ancestor_branch_name: str,
    tenant_id: Optional[TenantId] = None,
@@ -3537,7 +3572,7 @@ def fork_at_current_lsn(
    The "last LSN" is taken from the given Postgres instance. The pageserver will wait for all the
    WAL up to that LSN to arrive in the pageserver before creating the branch.
    """
-    current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
+    current_lsn = endpoint.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
    return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)


--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -7,7 +7,7 @@ import time
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Tuple, TypeVar

-import allure  # type: ignore
+import allure
 from psycopg2.extensions import cursor

 from fixtures.log_helper import log
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -52,13 +52,13 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int)
    def run_pgbench(branch: str):
        log.info(f"Start a pgbench workload on branch {branch}")

-        pg = env.postgres.create_start(branch, tenant_id=tenant)
-        connstr = pg.connstr()
+        endpoint = env.endpoints.create_start(branch, tenant_id=tenant)
+        connstr = endpoint.connstr()

        pg_bin.run_capture(["pgbench", "-i", connstr])
        pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])

-        pg.stop()
+        endpoint.stop()

    env.neon_cli.create_branch("b0", tenant_id=tenant)

@@ -96,8 +96,8 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int):

    env.neon_cli.create_branch("b0")

-    pg = env.postgres.create_start("b0")
-    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", pg.connstr()])
+    endpoint = env.endpoints.create_start("b0")
+    neon_compare.pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])

    branch_creation_durations = []

@@ -124,15 +124,15 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare):

    timeline_id = env.neon_cli.create_branch("root")

-    pg = env.postgres.create_start("root")
-    with closing(pg.connect()) as conn:
+    endpoint = env.endpoints.create_start("root")
+    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            for i in range(10000):
                cur.execute(f"CREATE TABLE t{i} as SELECT g FROM generate_series(1, 1000) g")

    # Wait for the pageserver to finish processing all the pending WALs,
    # as we don't want the LSN wait time to be included during the branch creation
-    flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
    wait_for_last_record_lsn(
        env.pageserver.http_client(), env.initial_tenant, timeline_id, flush_lsn
    )
@@ -142,7 +142,7 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare):

    # run a concurrent insertion to make the ancestor "busy" during the branch creation
    thread = threading.Thread(
-        target=pg.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",)
+        target=endpoint.safe_psql, args=("INSERT INTO t0 VALUES (generate_series(1, 100000))",)
    )
    thread.start()

--- a/test_runner/performance/test_branching.py
+++ b/test_runner/performance/test_branching.py
@@ -42,41 +42,41 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
        neon_compare.zenbenchmark.record_pg_bench_result(branch, res)

    env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
-    pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"])
+    endpoint_root = env.endpoints.create_start("root")
+    pg_bin.run_capture(["pgbench", "-i", endpoint_root.connstr(), "-s10"])

-    fork_at_current_lsn(env, pg_root, "child", "root")
+    fork_at_current_lsn(env, endpoint_root, "child", "root")

-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")

-    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()])
-    run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", pg_child.connstr()])
+    run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", endpoint_root.connstr()])
+    run_pgbench_on_branch("child", ["pgbench", "-c10", "-T10", endpoint_child.connstr()])


 def test_compare_child_and_root_write_perf(neon_compare: NeonCompare):
    env = neon_compare.env
    env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
+    endpoint_root = env.endpoints.create_start("root")

-    pg_root.safe_psql(
+    endpoint_root.safe_psql(
        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
    )

    env.neon_cli.create_branch("child", "root")
-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")

    with neon_compare.record_duration("root_run_duration"):
-        pg_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
+        endpoint_root.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
    with neon_compare.record_duration("child_run_duration"):
-        pg_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")
+        endpoint_child.safe_psql("INSERT INTO foo SELECT FROM generate_series(1,1000000)")


 def test_compare_child_and_root_read_perf(neon_compare: NeonCompare):
    env = neon_compare.env
    env.neon_cli.create_branch("root")
-    pg_root = env.postgres.create_start("root")
+    endpoint_root = env.endpoints.create_start("root")

-    pg_root.safe_psql_many(
+    endpoint_root.safe_psql_many(
        [
            "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
            "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
@@ -84,12 +84,12 @@ def test_compare_child_and_root_read_perf(neon_compare: NeonCompare):
    )

    env.neon_cli.create_branch("child", "root")
-    pg_child = env.postgres.create_start("child")
+    endpoint_child = env.endpoints.create_start("child")

    with neon_compare.record_duration("root_run_duration"):
-        pg_root.safe_psql("SELECT count(*) from foo")
+        endpoint_root.safe_psql("SELECT count(*) from foo")
    with neon_compare.record_duration("child_run_duration"):
-        pg_child.safe_psql("SELECT count(*) from foo")
+        endpoint_child.safe_psql("SELECT count(*) from foo")


 # -----------------------------------------------------------------------
--- a/test_runner/performance/test_bulk_tenant_create.py
+++ b/test_runner/performance/test_bulk_tenant_create.py
@@ -35,14 +35,14 @@ def test_bulk_tenant_create(
        # if use_safekeepers == 'with_sa':
        #    wa_factory.start_n_new(3)

-        pg_tenant = env.postgres.create_start(
+        endpoint_tenant = env.endpoints.create_start(
            f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant
        )

        end = timeit.default_timer()
        time_slices.append(end - start)

-        pg_tenant.stop()
+        endpoint_tenant.stop()

    zenbenchmark.record(
        "tenant_creation_time",
--- a/test_runner/performance/test_bulk_update.py
+++ b/test_runner/performance/test_bulk_update.py
@@ -18,8 +18,8 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)

    timeline_id = env.neon_cli.create_branch("test_bulk_update")
    tenant_id = env.initial_tenant
-    pg = env.postgres.create_start("test_bulk_update")
-    cur = pg.connect().cursor()
+    endpoint = env.endpoints.create_start("test_bulk_update")
+    cur = endpoint.connect().cursor()
    cur.execute("set statement_timeout=0")

    cur.execute(f"create table t(x integer) WITH (fillfactor={fillfactor})")
@@ -28,13 +28,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
        cur.execute(f"insert into t values (generate_series(1,{n_records}))")

    cur.execute("vacuum t")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    with zenbenchmark.record_duration("update-no-prefetch"):
        cur.execute("update t set x=x+1")

    cur.execute("vacuum t")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    with zenbenchmark.record_duration("delete-no-prefetch"):
        cur.execute("delete from t")
@@ -50,13 +50,13 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
        cur.execute(f"insert into t2 values (generate_series(1,{n_records}))")

    cur.execute("vacuum t2")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    with zenbenchmark.record_duration("update-with-prefetch"):
        cur.execute("update t2 set x=x+1")

    cur.execute("vacuum t2")
-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    with zenbenchmark.record_duration("delete-with-prefetch"):
        cur.execute("delete from t2")
--- a/test_runner/performance/test_compaction.py
+++ b/test_runner/performance/test_compaction.py
@@ -33,11 +33,11 @@ def test_compaction(neon_compare: NeonCompare):

    # Create some tables, and run a bunch of INSERTs and UPDATEs on them,
    # to generate WAL and layers
-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
        "main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
    )

-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            for i in range(100):
                cur.execute(f"create table tbl{i} (i int, j int);")
@@ -45,7 +45,7 @@ def test_compaction(neon_compare: NeonCompare):
                for j in range(100):
                    cur.execute(f"update tbl{i} set j = {j};")

-    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)

    # First compaction generates L1 layers
    with neon_compare.zenbenchmark.record_duration("compaction"):
--- a/test_runner/performance/test_latency.py
+++ b/test_runner/performance/test_latency.py
@@ -2,13 +2,13 @@ import threading

 import pytest
 from fixtures.compare_fixtures import PgCompare
-from fixtures.neon_fixtures import Postgres
+from fixtures.neon_fixtures import PgProtocol

 from performance.test_perf_pgbench import get_scales_matrix
 from performance.test_wal_backpressure import record_read_latency


-def start_write_workload(pg: Postgres, scale: int = 10):
+def start_write_workload(pg: PgProtocol, scale: int = 10):
    with pg.connect().cursor() as cur:
        cur.execute(f"create table big as select generate_series(1,{scale*100_000})")

--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -25,8 +25,8 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
    )

    env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
-    pg = env.postgres.create_start("test_layer_map", tenant_id=tenant)
-    cur = pg.connect().cursor()
+    endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
+    cur = endpoint.connect().cursor()
    cur.execute("create table t(x integer)")
    for i in range(n_iters):
        cur.execute(f"insert into t values (generate_series(1,{n_records}))")
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -14,19 +14,19 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
    # Start
    env.neon_cli.create_branch("test_startup")
    with zenbenchmark.record_duration("startup_time"):
-        pg = env.postgres.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint = env.endpoints.create_start("test_startup")
+        endpoint.safe_psql("select 1;")

    # Restart
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
    with zenbenchmark.record_duration("restart_time"):
-        pg.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")

    # Fill up
    num_rows = 1000000  # 30 MB
    num_tables = 100
-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            for i in range(num_tables):
                cur.execute(f"create table t_{i} (i integer);")
@@ -34,18 +34,18 @@ def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker

    # Read
    with zenbenchmark.record_duration("read_time"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")

    # Read again
    with zenbenchmark.record_duration("second_read_time"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")

    # Restart
-    pg.stop_and_destroy()
+    endpoint.stop_and_destroy()
    with zenbenchmark.record_duration("restart_with_data"):
-        pg.create_start("test_startup")
-        pg.safe_psql("select 1;")
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")

    # Read
    with zenbenchmark.record_duration("read_after_restart"):
-        pg.safe_psql("select * from t_0;")
+        endpoint.safe_psql("select * from t_0;")
--- a/test_runner/regress/test_ancestor_branch.py
+++ b/test_runner/regress/test_ancestor_branch.py
@@ -22,8 +22,8 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):

    pageserver_http.configure_failpoints(("flush-frozen-before-sync", "sleep(10000)"))

-    pg_branch0 = env.postgres.create_start("main", tenant_id=tenant)
-    branch0_cur = pg_branch0.connect().cursor()
+    endpoint_branch0 = env.endpoints.create_start("main", tenant_id=tenant)
+    branch0_cur = endpoint_branch0.connect().cursor()
    branch0_timeline = TimelineId(query_scalar(branch0_cur, "SHOW neon.timeline_id"))
    log.info(f"b0 timeline {branch0_timeline}")

@@ -44,10 +44,10 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):

    # Create branch1.
    env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100)
-    pg_branch1 = env.postgres.create_start("branch1", tenant_id=tenant)
+    endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant)
    log.info("postgres is running on 'branch1' branch")

-    branch1_cur = pg_branch1.connect().cursor()
+    branch1_cur = endpoint_branch1.connect().cursor()
    branch1_timeline = TimelineId(query_scalar(branch1_cur, "SHOW neon.timeline_id"))
    log.info(f"b1 timeline {branch1_timeline}")

@@ -67,9 +67,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):

    # Create branch2.
    env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200)
-    pg_branch2 = env.postgres.create_start("branch2", tenant_id=tenant)
+    endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant)
    log.info("postgres is running on 'branch2' branch")
-    branch2_cur = pg_branch2.connect().cursor()
+    branch2_cur = endpoint_branch2.connect().cursor()

    branch2_timeline = TimelineId(query_scalar(branch2_cur, "SHOW neon.timeline_id"))
    log.info(f"b2 timeline {branch2_timeline}")
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -63,9 +63,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):

    branch = "test_compute_auth_to_pageserver"
    env.neon_cli.create_branch(branch)
-    pg = env.postgres.create_start(branch)
+    endpoint = env.endpoints.create_start(branch)

-    with closing(pg.connect()) as conn:
+    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            # we rely upon autocommit after each statement
            # as waiting for acceptors happens there
@@ -82,7 +82,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

    branch = f"test_auth_failures_auth_enabled_{auth_enabled}"
    timeline_id = env.neon_cli.create_branch(branch)
-    env.postgres.create_start(branch)
+    env.endpoints.create_start(branch)

    tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
    invalid_tenant_token = env.auth_keys.generate_tenant_token(TenantId.generate())
--- a/test_runner/regress/test_backpressure.py
+++ b/test_runner/regress/test_backpressure.py
@@ -5,7 +5,7 @@ from contextlib import closing, contextmanager
 import psycopg2.extras
 import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, Postgres
+from fixtures.neon_fixtures import Endpoint, NeonEnvBuilder

 pytest_plugins = "fixtures.neon_fixtures"

@@ -20,10 +20,10 @@ def pg_cur(pg):
 # Periodically check that all backpressure lags are below the configured threshold,
 # assert if they are not.
 # If the check query fails, stop the thread. Main thread should notice that and stop the test.
-def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interval=5):
+def check_backpressure(endpoint: Endpoint, stop_event: threading.Event, polling_interval=5):
    log.info("checks started")

-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
        cur.execute("CREATE EXTENSION neon")  # TODO move it to neon_fixtures?

        cur.execute("select pg_size_bytes(current_setting('max_replication_write_lag'))")
@@ -41,7 +41,7 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv
        max_replication_apply_lag_bytes = res[0]
        log.info(f"max_replication_apply_lag: {max_replication_apply_lag_bytes} bytes")

-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
        while not stop_event.is_set():
            try:
                cur.execute(
@@ -102,14 +102,14 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
    # Create a branch for us
    env.neon_cli.create_branch("test_backpressure")

-    pg = env.postgres.create_start(
+    endpoint = env.endpoints.create_start(
        "test_backpressure", config_lines=["max_replication_write_lag=30MB"]
    )
    log.info("postgres is running on 'test_backpressure' branch")

    # setup check thread
    check_stop_event = threading.Event()
-    check_thread = threading.Thread(target=check_backpressure, args=(pg, check_stop_event))
+    check_thread = threading.Thread(target=check_backpressure, args=(endpoint, check_stop_event))
    check_thread.start()

    # Configure failpoint to slow down walreceiver ingest
@@ -125,7 +125,7 @@ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder):
    # because of the lag and waiting for lsn to replay to arrive.
    time.sleep(2)

-    with pg_cur(pg) as cur:
+    with pg_cur(endpoint) as cur:
        # Create and initialize test table
        cur.execute("CREATE TABLE foo(x bigint)")

--- a/test_runner/regress/test_basebackup_error.py
+++ b/test_runner/regress/test_basebackup_error.py
@@ -15,4 +15,4 @@ def test_basebackup_error(neon_simple_env: NeonEnv):
    pageserver_http.configure_failpoints(("basebackup-before-control-file", "return"))

    with pytest.raises(Exception, match="basebackup-before-control-file"):
-        env.postgres.create_start("test_basebackup_error")
+        env.endpoints.create_start("test_basebackup_error")
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -67,9 +67,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
    )

    timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant)
-    pg_main = env.postgres.create_start("test_main", tenant_id=tenant)
+    endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant)

-    main_cur = pg_main.connect().cursor()
+    main_cur = endpoint_main.connect().cursor()

    main_cur.execute(
        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
@@ -90,9 +90,9 @@ def test_branch_and_gc(neon_simple_env: NeonEnv):
    env.neon_cli.create_branch(
        "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1
    )
-    pg_branch = env.postgres.create_start("test_branch", tenant_id=tenant)
+    endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant)

-    branch_cur = pg_branch.connect().cursor()
+    branch_cur = endpoint_branch.connect().cursor()
    branch_cur.execute("INSERT INTO foo SELECT FROM generate_series(1, 100000)")

    assert query_scalar(branch_cur, "SELECT count(*) FROM foo") == 200000
@@ -142,8 +142,8 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    )

    b0 = env.neon_cli.create_branch("b0", tenant_id=tenant)
-    pg0 = env.postgres.create_start("b0", tenant_id=tenant)
-    res = pg0.safe_psql_many(
+    endpoint0 = env.endpoints.create_start("b0", tenant_id=tenant)
+    res = endpoint0.safe_psql_many(
        queries=[
            "CREATE TABLE t(key serial primary key)",
            "INSERT INTO t SELECT FROM generate_series(1, 100000)",
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -18,10 +18,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):

    # Branch at the point where only 100 rows were inserted
    env.neon_cli.create_branch("test_branch_behind")
-    pgmain = env.postgres.create_start("test_branch_behind")
+    endpoint_main = env.endpoints.create_start("test_branch_behind")
    log.info("postgres is running on 'test_branch_behind' branch")

-    main_cur = pgmain.connect().cursor()
+    main_cur = endpoint_main.connect().cursor()

    timeline = TimelineId(query_scalar(main_cur, "SHOW neon.timeline_id"))

@@ -74,15 +74,15 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
        "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b
    )

-    pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
-    pg_more = env.postgres.create_start("test_branch_behind_more")
+    endpoint_hundred = env.endpoints.create_start("test_branch_behind_hundred")
+    endpoint_more = env.endpoints.create_start("test_branch_behind_more")

    # On the 'hundred' branch, we should see only 100 rows
-    hundred_cur = pg_hundred.connect().cursor()
+    hundred_cur = endpoint_hundred.connect().cursor()
    assert query_scalar(hundred_cur, "SELECT count(*) FROM foo") == 100

    # On the 'more' branch, we should see 200100 rows
-    more_cur = pg_more.connect().cursor()
+    more_cur = endpoint_more.connect().cursor()
    assert query_scalar(more_cur, "SELECT count(*) FROM foo") == 200100

    # All the rows are visible on the main branch
@@ -94,8 +94,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.create_branch(
        "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000")
    )
-    pg = env.postgres.create_start("test_branch_segment_boundary")
-    assert pg.safe_psql("SELECT 1")[0][0] == 1
+    endpoint = env.endpoints.create_start("test_branch_segment_boundary")
+    assert endpoint.safe_psql("SELECT 1")[0][0] == 1

    # branch at pre-initdb lsn
    with pytest.raises(Exception, match="invalid branch start lsn: .*"):
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alexey Kondratov	89cc2c517a	Polish API handler and refresh OpenAPI spec	2023-04-05 23:35:35 +03:00
Alexey Kondratov	7cdf703345	Use Condvar and make configuration API blocking	2023-04-05 23:19:10 +03:00
Alexey Kondratov	70383087be	Allow starting `compute_ctl` without spec With this commit one can start compute with something like ```shell cargo run --bin compute_ctl -- -i no-compute \ -p http://localhost:9095 \ -D compute_pgdata \ -C "postgresql://cloud_admin@127.0.0.1:5434/postgres" \ -b ./pg_install/v15/bin/postgres ``` and it will hang waiting for spec. Then send one spec ```shell curl -d "$(cat ./compute-spec.json)" http://localhost:3080/spec ``` Postgres will be started and configured. Then reconfigure it with ```shell curl -d "$(cat ./compute-spec-new.json)" http://localhost:3080/spec ``` Most of safeguards and comments are added. Some polishing especially around HTTP API is still needed.	2023-04-05 22:09:43 +03:00
Alexey Kondratov	66dd3f8ca5	Implement live reconfiguration in the `compute_ctl` Accept spec in JSON format and request compute reconfiguration from the configurator thread. If anything goes wrong after we set the compute state to `ConfigurationPending` and / or sent spec to the configurator thread, we basically leave compute in the potentially wrong state. That said, it's control-plane's responsibility to watch compute state after reconfiguration request and to clean restart it in case of errors. It still lacks ability of starting up without spec and some validations, i.e. that live reconfiguration should be only available with `--compute-id` and `--control-plane-uri` options. Otherwise, it works fine and could be tested by running `compute_ctl` locally, then sending it a new spec: ```shell curl -d "$(cat ./compute-spec-new.json)" http://localhost:3080/spec ``` We have one configurator thread and async http server, so generally we have single consumer - multiple producers pattern here. That's why we use `mpsc` channel, not `tokio::sync::watch`. Actually, concurrency of producers is limited to one due to code logic, but we still need an ability to potentially pass `Sender` to several threads. Next, we use async `hyper` + `tokio` http server, but all the other code is completely synchronous. So we need to send data from async to sync, that's why we use `mpsc::unbounded_channel` here, not `mpsc::channel`. It doesn't make much sense to rewrite all code to async now, but we can consider doing this in the future. I think that a combination of `Mutex` and `CondVar` would work just fine too, but as we already have `tokio`, I decided to try something from it.	2023-04-05 21:31:44 +03:00
Heikki Linnakangas	1f2946af17	try to fix tests	2023-04-05 20:04:14 +03:00
Heikki Linnakangas	2735f1c41e	Rename "Postgres nodes" in control_plane to endpoints. We use the term "endpoint" for compute Postgres nodes in the web UI and user-facing documentation now. Adjust the nomenclature in the code. This changes the name of the "neon_local pg" command to "neon_local endpoint". Also adjust names of classes, variables etc. in the python tests accordingly. This also changes the directory structure so that endpoints are now stored in: .neon/endpoints/<endpoint id> instead of: .neon/pgdatadirs/tenants/<tenant_id>/<endpoint (node) name> The tenant ID is no longer part of the path. That means that you cannot have two endpoints with the same name/ID in two different tenants anymore. That's consistent with how we treat endpoints in the real control plane and proxy: the endpoint ID must be globally unique.	2023-04-05 19:49:25 +03:00
Heikki Linnakangas	8e06018dae	Move compute_ctl structs used in HTTP API and spec file to separate crate. This is in preparation of using compute_ctl to launch postgres nodes in the neon_local control plane. And seems like a good idea to separate the public interfaces anyway. One non-mechanical change here is that we now use a RwLock rather than atomics to protect the ComputeNode::metrics field. We were not using atomics for performance but for convenience here, and an RwLock is now more convenient.	2023-04-05 19:49:08 +03:00
Alexander Bayandin	957acb51b5	GitHub Autocomment: Fix the link to the latest commit (#3952 )	2023-04-04 19:06:10 +03:00
Alexander Bayandin	1d23b5d1de	Comment PR with test results (#3907 ) This PR adds posting a comment with test results. Each workflow run updates the comment with new results. The layout and the information that we post can be changed to our needs, right now, it contains failed tests and test which changes status after rerun (i.e. flaky tests)	2023-04-04 12:22:47 +01:00
Alexander Bayandin	105b8bb9d3	test_runner: automatically rerun flaky tests (#3880 ) This PR adds a plugin that automatically reruns (up to 3 times) flaky tests. Internally, it uses data from `TEST_RESULT_CONNSTR` database and `pytest-rerunfailures` plugin. As the first approximation we consider the test flaky if it has failed on the main branch in the last 10 days. Flaky tests are fetched by `scripts/flaky_tests.py` script (it's possible to use it in a standalone mode to learn which tests are flaky), stored to a JSON file, and then the file is passed to the pytest plugin.	2023-04-04 12:21:54 +01:00
Kirill Bulatov	846532112c	Remove unused S3 list operation (#3936 ) In S3, pageserver only lists tenants (prefixes) on S3, no other keys. Remove the list operation from the API, since the S3 impl does not seem to work normally and is not used anyway.	2023-04-03 23:44:38 +03:00
Dmitry Ivanov	f85a61ceac	[proxy] Fix regression in logging For some reason, `tracing::instrument` proc_macro doesn't always print elements specified via `fields()` or even show that it's impossible (e.g. there's no Display impl). Work around this using the `?foo` notation. Before: 2023-04-03T14:48:06.017504Z INFO handle_client🤝 received SslRequest After: 2023-04-03T14:51:24.424176Z INFO handle_client{session_id=7bd07be8-3462-404e-8ccc-0a5332bf3ace}🤝 received SslRequest	2023-04-03 18:49:30 +03:00
Christian Schwarz	45bf76eb05	enable layer eviction by default in prod (#3933 ) Leave disk_usage_based_eviction above the current max usage in prod (82%ish), so that deploying this commit won't trigger disk_usage_based_eviction. As indicated in the TODO, we'll decrease the value to 80% later. Also update the staging YAMLs to use the anchor syntax for `evictions_low_residence_duration_metric_threshold` like we do in the prod YAMLs as of this patch.	2023-04-03 14:57:36 +02:00
Joonas Koivunen	a415670bc3	feat: log evictions (#3930 ) this will help log analysis with the counterpart of already logging all remote download needs and downloads. ended up with an easily regexable output in the final round.	2023-04-03 14:15:41 +03:00
Joonas Koivunen	cf5cfe6d71	fix: metric used for alerting threshold on staging (#3932 ) This should remove the too eager alerts from staging.	2023-04-03 13:26:45 +03:00
Arseny Sher	d733bc54b8	Rename ReplicationFeedback and its fields. This is the feedback originating from pageserver, so change previous confusing names to s/ReplicationFeedback/PageserverFeedback s/ps_writelsn/last_receive_lsn s/ps_flushlsn/disk_consistent_lsn s/ps_apply_lsn/remote_consistent_lsn I haven't changed the on-the-wire format to keep compatibility. However, understanding of new field names is added to compute, so once all computes receive this patch we can change the wire names as well. Safekeepers/pageservers are deployed roughly at the same time and it is ok to live without feedbacks during the short period, so this is not a problem there.	2023-04-03 01:52:41 +04:00
Arthur Petukhovsky	814abd9f84	Switch to safekeeper in the same AZ (#3883 ) Add a condition to switch walreceiver connection to safekeeper that is located in the same availability zone. Switch happens when commit_lsn of a candidate is not less than commit_lsn from the active connection. This condition is expected not to trigger instantly, because commit_lsn of a current connection is usually greater than commit_lsn of updates from the broker. That means that if WAL is written continuously, switch can take a lot of time, but it should happen eventually. Now protoc 3.15+ is required for building neon. Fixes https://github.com/neondatabase/neon/issues/3200	2023-04-02 11:32:27 +03:00
Alexander Bayandin	75ffe34b17	check-macos-build: fix cache key (#3926 ) We don't have `${{ matrix.build_type }}` there, so it gets resolved to an empty substring and looks like this [`v1-macOS--pg-f8a650e49b06d39ad131b860117504044b01f312-dcccd010ff851b9f72bb451f28243fa3a341f07028034bbb46ea802413b36d80`](https://github.com/neondatabase/neon/actions/runs/4575422427/jobs/8078231907#step:26:2)	2023-03-31 21:45:59 +03:00
Christian Schwarz	d2aa31f0ce	fix pageserver_evictions_with_low_residence_duration metric (#3925 ) It was doing the comparison in the wrong way.	2023-03-31 19:25:53 +03:00
Dmitry Rodionov	22f9ea5fe2	Remind people to clean up merge commit message in PR template (#3920 )	2023-03-31 16:11:34 +03:00
Joonas Koivunen	d0711d0896	build: fix git perms for deploy job (#3921 ) copy pasted from `build-neon` job. it is interesting that this is only needed by `build-neon` and `deploy`. Fixes: https://github.com/neondatabase/neon/actions/runs/4568077915/jobs/8070960178 which seems to have been going for a while.	2023-03-31 16:05:15 +03:00
Arseny Sher	271f6a6e99	Always sync-safekeepers in neon_local on compute start. Instead of checking neon.safekeepers GUC value in existing pg node data dir, just always run sync-safekeepers when safekeepers are configured. Without this change, creation of new compute didn't run it. That's ok for new timeline/branch (it doesn't return anything useful anyway, and LSN is known by pageserver), but restart of compute for existing timeline bore the risk of getting basebackup not on the latest LSN, i.e. basically broken -- it might not have prev_lsn, and even if it had, walproposer would complain anyway. fixes https://github.com/neondatabase/neon/issues/2963	2023-03-31 16:15:06 +04:00
Christian Schwarz	a64dd3ecb5	disk-usage-based layer eviction (#3809 ) This patch adds a pageserver-global background loop that evicts layers in response to a shortage of available bytes in the $repo/tenants directory's filesystem. The loop runs periodically at a configurable `period`. Each loop iteration uses `statvfs` to determine filesystem-level space usage. It compares the returned usage data against two different types of thresholds. The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. We're good if that second statvfs shows that we're _actually_ below the configured thresholds. If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. There are two thresholds: - `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. If the actual usage is higher, the threshold is exceeded. - `min_avail_bytes` is the absolute available space in bytes. If the actual usage is lower, the threshold is exceeded. The iteration evicts layers in LRU fashion with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant. The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds. The `tenant_min_resident_size` can be overridden per tenant as `min_resident_size_override` (bytes). In addition to the loop, there is also an HTTP endpoint to perform one loop iteration synchronous to the request. The endpoint takes an absolute number of bytes that the iteration needs to evict before pressure is relieved. The tests use this endpoint, which is a great simplification over setting up loopback-mounts in the tests, which would be required to test the statvfs part of the implementation. 
We will rely on manual testing in staging to test the statvfs parts. The HTTP endpoint is also handy in emergencies where an operator wants the pageserver to evict a given amount of space _now_. Hence, its arguments are documented in openapi_spec.yml. The response type isn't documented though because we don't consider it stable. The endpoint should _not_ be used by Console but it could be used by on-call. Co-authored-by: Joonas Koivunen <joonas@neon.tech> Co-authored-by: Dmitry Rodionov <dmitry@neon.tech> Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2023-03-31 14:47:57 +03:00