Merge pull request #8117 from neondatabase/rc/proxy/2024-06-20

Proxy release 2024-06-20
CI: miscellaneous cleanups (#8073 )
2026-02-04 19:20:36 +00:00 · 2024-06-20 11:42:35 +01:00 · 2024-06-19 19:21:09 +01:00 · 2024-06-19 16:54:07 +00:00 · 2024-06-19 16:07:14 +00:00 · 2024-06-19 15:05:31 +02:00
189 changed files with 6253 additions and 2413 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -8,6 +8,7 @@
 !scripts/combine_control_files.py
 !scripts/ninstall.sh
 !vm-cgconfig.conf
+!docker-compose/run-tests.sh

 # Directories
 !.cargo/
@@ -20,7 +21,7 @@
 !patches/
 !pgxn/
 !proxy/
-!s3_scrubber/
+!storage_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -183,8 +183,7 @@ runs:

        # Run the tests.
        #
-        # The junit.xml file allows CI tools to display more fine-grained test information
-        # in its "Tests" tab in the results page.
+        # --alluredir saves test results in Allure format (in a specified directory)
        # --verbose prints name of each test (helpful when there are
        # multiple tests in one file)
        # -rA prints summary in the end
@@ -193,7 +192,6 @@ runs:
        #
        mkdir -p $TEST_OUTPUT/allure/results
        "${cov_prefix[@]}" ./scripts/pytest \
-          --junitxml=$TEST_OUTPUT/junit.xml \
          --alluredir=$TEST_OUTPUT/allure/results \
          --tb=short \
          --verbose \
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -36,15 +36,16 @@ jobs:
          fail_on_error: true
          filter_mode: nofilter
          level: error
-      - run: |
+
+      - name: Disallow 'ubuntu-latest' runners
+        run: |
          PAT='^\s*runs-on:.*-latest'
-          if grep -ERq $PAT .github/workflows
-          then
+          if grep -ERq $PAT .github/workflows; then
            grep -ERl $PAT .github/workflows |\
            while read -r f
            do
              l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
-              echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
+              echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
            done
            exit 1
          fi
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -99,7 +99,7 @@ jobs:
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
+        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,14 +410,14 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

-    - name: Benchmark pgvector hnsw queries
+    - name: Benchmark pgvector queries
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
-        test_selection: performance
+        test_selection: performance/test_perf_pgvector_queries.py
        run_in_parallel: false
        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
-        extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
+        extra_params: -m remote_cluster --timeout 21600 
      env:
        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -30,7 +30,6 @@ jobs:
  check-image:
    uses: ./.github/workflows/check-build-tools-image.yml

-  # This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
  build-image:
    needs: [ check-image ]
    if: needs.check-image.outputs.found == 'false'
@@ -55,7 +54,7 @@ jobs:
            exit 1
          fi

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4

      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
      # The default value is ~/.docker
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -299,21 +299,21 @@ jobs:
        uses: actions/cache@v4
        with:
          path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
        uses: actions/cache@v4
        with:
          path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
        uses: actions/cache@v4
        with:
          path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}

      - name: Build postgres v14
        if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -337,34 +337,8 @@ jobs:
        run: |
          ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests

-      - name: Run rust tests
-        env:
-          NEXTEST_RETRIES: 3
-        run: |
-          #nextest does not yet support running doctests
-          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
-
-          for io_engine in std-fs tokio-epoll-uring ; do
-            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
-          done
-
-          # Run separate tests for real S3
-          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
-          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
-          export REMOTE_STORAGE_S3_REGION=eu-central-1
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_s3)'
-
-          # Run separate tests for real Azure Blob Storage
-          # XXX: replace region with `eu-central-1`-like region
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
-          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
-          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
-          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo nextest run $CARGO_FLAGS -E 'package(remote_storage)' -E 'test(test_real_azure)'
-
+      # Do install *before* running rust tests because they might recompile the
+      # binaries with different features/flags.
      - name: Install rust binaries
        run: |
          # Install target binaries
@@ -405,6 +379,32 @@ jobs:
            done
          fi

+      - name: Run rust tests
+        env:
+          NEXTEST_RETRIES: 3
+        run: |
+          #nextest does not yet support running doctests
+          cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
+
+          for io_engine in std-fs tokio-epoll-uring ; do
+            NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
+          done
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
+
      - name: Install postgres binaries
        run: cp -a pg_install /tmp/neon/pg_install

@@ -859,6 +859,26 @@ jobs:
          tags: |
            neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}

+      - name: Build neon extensions test image
+        if: matrix.version == 'v16'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+            PG_VERSION=${{ matrix.version }}
+            BUILD_TAG=${{ needs.tag.outputs.build-tag }}
+            TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
+          provenance: false
+          push: true
+          pull: true
+          file: Dockerfile.compute-node
+          target: neon-pg-ext-test
+          cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
+          cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max
+          tags: |
+            neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
+
      - name: Build compute-tools image
        # compute-tools are Postgres independent, so build it only once
        if: matrix.version == 'v16'
@@ -902,6 +922,13 @@ jobs:
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
                                             neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64

+      - name: Create multi-arch neon-test-extensions image
+        if: matrix.version == 'v16'
+        run: |
+          docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \
+                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \
+                                             neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64
+
      - name: Create multi-arch compute-tools image
        if: matrix.version == 'v16'
        run: |
@@ -938,7 +965,7 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
        with:
          fetch-depth: 0

@@ -996,6 +1023,18 @@ jobs:
        with:
          fetch-depth: 0

+      # Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
+      # The default value is ~/.docker
+      - name: Set custom docker config directory
+        run: |
+          mkdir -p .docker-custom
+          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
+
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
      # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
      # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
      # Regular pageserver version string looks like
@@ -1020,7 +1059,7 @@ jobs:
            exit 1
          fi

-      - name: Verify docker-compose example
+      - name: Verify docker-compose example and test extensions
        timeout-minutes: 20
        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh

@@ -1030,6 +1069,11 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml logs || 0
          docker compose -f ./docker-compose/docker-compose.yml down

+      - name: Remove custom docker config directory
+        if: always()
+        run: |
+          rm -rf .docker-custom
+
  promote-images:
    needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
    runs-on: ubuntu-22.04
@@ -1043,7 +1087,8 @@ jobs:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

-      - uses: docker/login-action@v3
+      - name: Login to dev ECR
+        uses: docker/login-action@v3
        with:
          registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
          username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -1074,6 +1119,24 @@ jobs:
                                                 $repo/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
            done
          done
+          docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
+                                             neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
+
+      - name: Login to prod ECR
+        uses: docker/login-action@v3
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        with:
+          registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
+          username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
+          password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
+
+      - name: Copy all images to prod ECR
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        run: |
+          for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
+            docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
+                                               369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
+          done

  trigger-custom-extensions-build-and-wait:
    needs: [ check-permissions, tag ]
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -25,26 +25,17 @@ jobs:
      found: ${{ steps.check-image.outputs.found }}

    steps:
+      - uses: actions/checkout@v4
+
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
-          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
-          COMMIT_SHA: ${{ github.sha }}
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          IMAGE_TAG: |
+            ${{ hashFiles('Dockerfile.build-tools',
+                          '.github/workflows/check-build-tools-image.yml',
+                          '.github/workflows/build-build-tools-image.yml') }}
        run: |
-          LAST_BUILD_TOOLS_SHA=$(
-            gh api \
-              -H "Accept: application/vnd.github+json" \
-              -H "X-GitHub-Api-Version: 2022-11-28" \
-              --method GET \
-              --field path=Dockerfile.build-tools \
-              --field sha=${COMMIT_SHA} \
-              --field per_page=1 \
-              --jq ".[0].sha" \
-              "/repos/${GITHUB_REPOSITORY}/commits"
-          )
-          echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
+          echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT

      - name: Check if such tag found in the registry
        id: check-image
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -52,13 +52,15 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
+        TITLE="Storage & Compute release ${RELEASE_DATE}"
+
        cat << EOF > body.md
-          ## Storage & Compute release ${RELEASE_DATE}
+          ## ${TITLE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Release ${RELEASE_DATE}" \
+        gh pr create --title "${TITLE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release"
@@ -91,13 +93,15 @@ jobs:
      env:
        GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
      run: |
+        TITLE="Proxy release ${RELEASE_DATE}"
+
        cat << EOF > body.md
-          ## Proxy release ${RELEASE_DATE}
+          ## ${TITLE}

          **Please merge this Pull Request using 'Create a merge commit' button**
        EOF

-        gh pr create --title "Proxy release ${RELEASE_DATE}" \
+        gh pr create --title "${TITLE}" \
                     --body-file "body.md" \
                     --head "${RELEASE_BRANCH}" \
                     --base "release-proxy"
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5109,54 +5109,6 @@ version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"

-[[package]]
-name = "s3_scrubber"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "async-stream",
- "aws-config",
- "aws-sdk-s3",
- "aws-smithy-async",
- "bincode",
- "bytes",
- "camino",
- "chrono",
- "clap",
- "crc32c",
- "either",
- "futures",
- "futures-util",
- "hex",
- "histogram",
- "humantime",
- "itertools",
- "once_cell",
- "pageserver",
- "pageserver_api",
- "postgres_ffi",
- "rand 0.8.5",
- "remote_storage",
- "reqwest 0.12.4",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "serde",
- "serde_json",
- "serde_with",
- "thiserror",
- "tokio",
- "tokio-postgres",
- "tokio-postgres-rustls",
- "tokio-rustls 0.25.0",
- "tokio-stream",
- "tokio-util",
- "tracing",
- "tracing-appender",
- "tracing-subscriber",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "safekeeper"
 version = "0.1.0"
@@ -5206,6 +5158,7 @@ dependencies = [
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
+ "tokio-tar",
 "tokio-util",
 "toml_edit",
 "tracing",
@@ -5801,6 +5754,7 @@ dependencies = [
 "r2d2",
 "reqwest 0.12.4",
 "routerify",
+ "scopeguard",
 "serde",
 "serde_json",
 "strum",
@@ -5813,6 +5767,54 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "storage_scrubber"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-stream",
+ "aws-config",
+ "aws-sdk-s3",
+ "aws-smithy-async",
+ "bincode",
+ "bytes",
+ "camino",
+ "chrono",
+ "clap",
+ "crc32c",
+ "either",
+ "futures",
+ "futures-util",
+ "hex",
+ "histogram",
+ "humantime",
+ "itertools",
+ "once_cell",
+ "pageserver",
+ "pageserver_api",
+ "postgres_ffi",
+ "rand 0.8.5",
+ "remote_storage",
+ "reqwest 0.12.4",
+ "rustls 0.22.4",
+ "rustls-native-certs 0.7.0",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
+ "tokio-rustls 0.25.0",
+ "tokio-stream",
+ "tokio-util",
+ "tracing",
+ "tracing-appender",
+ "tracing-subscriber",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "storcon_cli"
 version = "0.1.0"
@@ -5820,6 +5822,7 @@ dependencies = [
 "anyhow",
 "clap",
 "comfy-table",
+ "futures",
 "humantime",
 "hyper 0.14.26",
 "pageserver_api",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ members = [
    "safekeeper",
    "storage_broker",
    "storage_controller",
-    "s3_scrubber",
+    "storage_scrubber",
    "workspace_hack",
    "trace",
    "libs/compute_api",
@@ -120,7 +120,7 @@ num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
 opentelemetry = "0.20.0"
-opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
@@ -128,7 +128,7 @@ parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
-prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
+prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
@@ -184,7 +184,7 @@ tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
 tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/Dockerfile
+++ b/Dockerfile
@@ -69,8 +69,6 @@ RUN set -e \
    && apt install -y \
        libreadline-dev \
        libseccomp-dev \
-        libicu67 \
-        openssl \
        ca-certificates \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && useradd -d /data neon \
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -112,6 +112,45 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
    && make install \
    && rm -rf ../lcov.tar.gz

+# Compile and install the static OpenSSL library
+ENV OPENSSL_VERSION=3.2.2
+ENV OPENSSL_PREFIX=/usr/local/openssl
+RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
+    echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
+    cd /tmp && \
+    tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
+    rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
+    cd /tmp/openssl-${OPENSSL_VERSION} && \
+    ./config --prefix=${OPENSSL_PREFIX}  -static --static no-shared -fPIC && \
+    make -j "$(nproc)" && \
+    make install && \
+    cd /tmp && \
+    rm -rf /tmp/openssl-${OPENSSL_VERSION}
+
+# Use the same version of libicu as the compute nodes so that
+# clusters created using initdb on pageserver can be used by computes.
+#
+# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
+# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
+# Debian has a few patches on top of 67.1 that we're not adding here.
+ENV ICU_VERSION=67.1
+ENV ICU_PREFIX=/usr/local/icu
+
+# Download and build static ICU
+RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
+    echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \
+    mkdir /tmp/icu && \
+    pushd /tmp/icu && \
+    tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \
+    pushd icu/source && \
+    ./configure --prefix=${ICU_PREFIX}  --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \
+    make -j "$(nproc)" && \
+    make install && \
+    popd && \
+    rm -rf icu && \
+    rm -f /tmp/libicu-${ICU_VERSION}.tgz && \
+    popd
+
 # Switch to nonroot user
 USER nonroot:nonroot
 WORKDIR /home/nonroot
@@ -141,7 +180,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.78.0
+ENV RUSTC_VERSION=1.79.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -170,3 +209,6 @@ RUN whoami \
    && rustup --version --verbose \
    && rustc --version --verbose \
    && clang --version
+
+# Create a marker file that the Makefile checks to detect whether it's running in Docker
+RUN touch /home/nonroot/.docker_build
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,13 +241,15 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

+COPY patches/pgvector.patch /pgvector.patch

 # By default, pgvector Makefile uses `-march=native`. We don't want that,
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.1.tar.gz -O pgvector.tar.gz && \
-    echo "fe6c8cb4e0cd1a8cb60f5badf9e1701e0fcabcfc260931c26d01e155c4dd21d1 pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \
+    echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -926,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

+
+#########################################################################################
+#
+# Layer neon-pg-ext-test
+#
+#########################################################################################
+
+FROM neon-pg-ext-build AS neon-pg-ext-test
+ARG PG_VERSION
+RUN mkdir /ext-src
+
+#COPY --from=postgis-build /postgis.tar.gz /ext-src/
+#COPY --from=postgis-build /sfcgal/* /usr
+COPY --from=plv8-build /plv8.tar.gz /ext-src/
+COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
+COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
+COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
+COPY --from=vector-pg-build /pgvector.patch /ext-src/
+COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
+#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
+#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
+#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
+COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src
+COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src
+#COPY --from=rum-pg-build /rum.tar.gz /ext-src
+#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src
+COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src
+COPY --from=prefix-pg-build /prefix.tar.gz /ext-src
+COPY --from=hll-pg-build /hll.tar.gz /ext-src
+COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
+#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
+COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
+COPY patches/pg_hintplan.patch /ext-src
+#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
+COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
+COPY patches/pg_cron.patch /ext-src
+#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
+COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src
+COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src
+COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
+COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
+#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
+#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
+COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
+COPY patches/pg_anon.patch /ext-src
+COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
+COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
+RUN cd /ext-src/ && for f in *.tar.gz; \
+    do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
+    rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
+    || exit 1; rm -f $f; done
+RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+# cmake is required for the h3 test
+RUN apt-get update && apt-get install -y cmake
+RUN patch -p1 < /ext-src/pg_hintplan.patch
+COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
+RUN patch -p1 </ext-src/pg_anon.patch
+RUN patch -p1 </ext-src/pg_cron.patch
+ENV PATH=/usr/local/pgsql/bin:$PATH
+ENV PGHOST=compute
+ENV PGPORT=55433
+ENV PGUSER=cloud_admin
+ENV PGDATABASE=postgres
 #########################################################################################
 #
 # Final layer
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,9 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # Where to install Postgres, default is ./pg_install, maybe useful for package managers
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

+OPENSSL_PREFIX_DIR := /usr/local/openssl
+ICU_PREFIX_DIR := /usr/local/icu
+
 #
 # We differentiate between release / debug build types using the BUILD_TYPE
 # environment variable.
@@ -20,6 +23,16 @@ else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif

+ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
+	# Use the statically built OpenSSL and ICU only inside the Docker build image;
+	# local builds (macOS, Linux) skip this. Applies to both release and debug build types.
+	PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
+	PG_CONFIGURE_OPTS += --with-icu
+	PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
+	PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
+	PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
+endif
+
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
@@ -28,7 +41,7 @@ else ifeq ($(UNAME_S),Darwin)
 	ifndef DISABLE_HOMEBREW
 		# macOS with brew-installed openssl requires explicit paths
 		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+		OPENSSL_PREFIX := $(shell brew --prefix openssl@3)
 		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
 		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
 		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
@@ -124,6 +137,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
+	+@echo "Compiling test_decoding $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/test_decoding install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -735,7 +735,7 @@ fn cli() -> clap::Command {
            Arg::new("filecache-connstr")
                .long("filecache-connstr")
                .default_value(
-                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable",
+                    "host=localhost port=5432 dbname=postgres user=cloud_admin sslmode=disable application_name=vm-monitor",
                )
                .value_name("FILECACHE_CONNSTR"),
        )
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -918,38 +918,39 @@ impl ComputeNode {
        // temporarily reset max_cluster_size in config
        // to avoid the possibility of hitting the limit, while we are reconfiguring:
        // creating new extensions, roles, etc...
-        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
-        self.pg_reload_conf()?;
+        config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
+            self.pg_reload_conf()?;

-        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+            let mut client = Client::connect(self.connstr.as_str(), NoTls)?;

-        // Proceed with post-startup configuration. Note, that order of operations is important.
-        // Disable DDL forwarding because control plane already knows about these roles/databases.
-        if spec.mode == ComputeMode::Primary {
-            client.simple_query("SET neon.forward_ddl = false")?;
-            cleanup_instance(&mut client)?;
-            handle_roles(&spec, &mut client)?;
-            handle_databases(&spec, &mut client)?;
-            handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
-            handle_grants(
-                &spec,
-                &mut client,
-                self.connstr.as_str(),
-                self.has_feature(ComputeFeature::AnonExtension),
-            )?;
-            handle_extensions(&spec, &mut client)?;
-            handle_extension_neon(&mut client)?;
-            // We can skip handle_migrations here because a new migration can only appear
-            // if we have a new version of the compute_ctl binary, which can only happen
-            // if compute got restarted, in which case we'll end up inside of apply_config
-            // instead of reconfigure.
-        }
+            // Proceed with post-startup configuration. Note, that order of operations is important.
+            // Disable DDL forwarding because control plane already knows about these roles/databases.
+            if spec.mode == ComputeMode::Primary {
+                client.simple_query("SET neon.forward_ddl = false")?;
+                cleanup_instance(&mut client)?;
+                handle_roles(&spec, &mut client)?;
+                handle_databases(&spec, &mut client)?;
+                handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
+                handle_grants(
+                    &spec,
+                    &mut client,
+                    self.connstr.as_str(),
+                    self.has_feature(ComputeFeature::AnonExtension),
+                )?;
+                handle_extensions(&spec, &mut client)?;
+                handle_extension_neon(&mut client)?;
+                // We can skip handle_migrations here because a new migration can only appear
+                // if we have a new version of the compute_ctl binary, which can only happen
+                // if compute got restarted, in which case we'll end up inside of apply_config
+                // instead of reconfigure.
+            }

-        // 'Close' connection
-        drop(client);
+            // 'Close' connection
+            drop(client);
+
+            Ok(())
+        })?;

-        // reset max_cluster_size in config back to original value and reload config
-        config::compute_ctl_temp_override_remove(pgdata_path)?;
        self.pg_reload_conf()?;

        let unknown_op = "unknown".to_string();
@@ -1040,12 +1041,17 @@ impl ComputeNode {
                // temporarily reset max_cluster_size in config
                // to avoid the possibility of hitting the limit, while we are applying config:
                // creating new extensions, roles, etc...
-                config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
-                self.pg_reload_conf()?;
+                config::with_compute_ctl_tmp_override(
+                    pgdata_path,
+                    "neon.max_cluster_size=-1",
+                    || {
+                        self.pg_reload_conf()?;

-                self.apply_config(&compute_state)?;
+                        self.apply_config(&compute_state)?;

-                config::compute_ctl_temp_override_remove(pgdata_path)?;
+                        Ok(())
+                    },
+                )?;
                self.pg_reload_conf()?;
            }
            self.post_apply_config()?;
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -131,18 +131,17 @@ pub fn write_postgres_conf(
    Ok(())
 }

-/// create file compute_ctl_temp_override.conf in pgdata_dir
-/// add provided options to this file
-pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+pub fn with_compute_ctl_tmp_override<F>(pgdata_path: &Path, options: &str, exec: F) -> Result<()>
+where
+    F: FnOnce() -> Result<()>,
+{
    let path = pgdata_path.join("compute_ctl_temp_override.conf");
    let mut file = File::create(path)?;
    write!(file, "{}", options)?;
-    Ok(())
-}

-/// remove file compute_ctl_temp_override.conf in pgdata_dir
-pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
-    let path = pgdata_path.join("compute_ctl_temp_override.conf");
-    std::fs::remove_file(path)?;
-    Ok(())
+    let res = exec();
+
+    file.set_len(0)?;
+
+    res
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -17,7 +17,7 @@ use hyper::header::CONTENT_TYPE;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use tokio::task;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing_utils::http::OtelName;
 use utils::http::request::must_get_query_param;

@@ -48,7 +48,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
    match (req.method(), req.uri().path()) {
        // Serialized compute state.
        (&Method::GET, "/status") => {
-            info!("serving /status GET request");
+            debug!("serving /status GET request");
            let state = compute.state.lock().unwrap();
            let status_response = status_response_from_state(&state);
            Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
--- a/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
+++ b/compute_tools/src/migrations/0000-neon_superuser_bypass_rls.sql
@@ -0,0 +1 @@
+ALTER ROLE neon_superuser BYPASSRLS;
--- a/compute_tools/src/migrations/0001-alter_roles.sql
+++ b/compute_tools/src/migrations/0001-alter_roles.sql
@@ -0,0 +1,18 @@
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
+
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
--- a/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql
@@ -0,0 +1,6 @@
+DO $$
+BEGIN
+    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
+        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+    END IF;
+END $$;
--- a/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_monitor_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0004-grant_all_on_tables_to_neon_superuser.sql
@@ -0,0 +1,4 @@
+-- SKIP: Deemed insufficient for allowing relations created by extensions to be
+--       interacted with by neon_superuser without permission issues.
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
--- a/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_sequences_to_neon_superuser.sql
@@ -0,0 +1,4 @@
+-- SKIP: Deemed insufficient for allowing relations created by extensions to be
+--       interacted with by neon_superuser without permission issues.
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
--- a/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
@@ -0,0 +1,3 @@
+-- SKIP: Moved inline to the handle_grants() functions.
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
@@ -0,0 +1,3 @@
+-- SKIP: Moved inline to the handle_grants() functions.
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
--- a/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
+++ b/compute_tools/src/migrations/0008-revoke_replication_for_previously_allowed_roles.sql
@@ -0,0 +1,13 @@
+-- SKIP: The original goal of this migration was to prevent creating
+--       subscriptions, but this migration was insufficient.
+
+DO $$
+DECLARE
+    role_name TEXT;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
+    END LOOP;
+END $$;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -774,44 +774,21 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

+    // Add new migrations in numerical order.
    let migrations = [
-        "ALTER ROLE neon_superuser BYPASSRLS",
-        r#"
-DO $$
-DECLARE
-    role_name text;
-BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
-    END LOOP;
-
-    FOR role_name IN SELECT rolname FROM pg_roles
-        WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
-    LOOP
-        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
-        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
-    END LOOP;
-END $$;
-"#,
-        r#"
-DO $$
-BEGIN
-    IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
-    END IF;
-END
-$$;"#,
-        "GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
-        // Don't remove: these are some SQLs that we originally applied in migrations but turned out to execute somewhere else.
-        "",
-        "",
-        "",
-        "",
-        "",
-        // Add new migrations below.
+        include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
+        include_str!("./migrations/0001-alter_roles.sql"),
+        include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
+        include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
+        include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
+        include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
+        include_str!(
+            "./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+        ),
+        include_str!(
+            "./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
+        ),
+        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

    let mut func = || {
@@ -847,10 +824,13 @@ $$;"#,

    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
-        if migration.is_empty() {
-            info!("Skip migration id={}", current_migration);
+        if migration.starts_with("-- SKIP") {
+            info!("Skipping migration id={}", current_migration);
        } else {
-            info!("Running migration:\n{}\n", migration);
+            info!(
+                "Running migration id={}:\n{}\n",
+                current_migration, migration
+            );
            client.simple_query(migration).with_context(|| {
                format!("handle_migrations current_migration={}", current_migration)
            })?;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -69,6 +69,9 @@ where
    // Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
    EI: IntoIterator<Item = (String, String)>,
 {
+    if !datadir.metadata().context("stat datadir")?.is_dir() {
+        anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
+    }
    let log_path = datadir.join(format!("{process_name}.log"));
    let process_log_file = fs::OpenOptions::new()
        .create(true)
@@ -85,7 +88,13 @@ where
    let background_command = command
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
-        .args(args);
+        .args(args)
+        // spawn all child processes in their datadir, useful for all kinds of things,
+        // not least cleaning up child processes e.g. after an unclean exit from the test suite:
+        // ```
+        // lsof -d cwd -a +D /path/to/neon/test_output
+        // ```
+        .current_dir(datadir);

    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
        fill_rust_env_vars(background_command),
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -87,7 +87,8 @@ fn main() -> Result<()> {
        handle_init(sub_args).map(Some)
    } else {
        // all other commands need an existing config
-        let mut env = LocalEnv::load_config().context("Error loading config")?;
+        let mut env =
+            LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
        let original_env = env.clone();

        let rt = tokio::runtime::Builder::new_current_thread()
@@ -364,7 +365,8 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    LocalEnv::init(init_conf, force)
        .context("materialize initial neon_local environment on disk")?;
-    Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
+    Ok(LocalEnv::load_config(&local_env::base_path())
+        .expect("freshly written config should be loadable"))
 }

 /// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -42,8 +42,8 @@ pub struct LocalEnv {
    // compute endpoints).
    //
    // This is not stored in the config file. Rather, this is the path where the
-    // config file itself is. It is read from the NEON_REPO_DIR env variable or
-    // '.neon' if not given.
+    // config file itself is. It is read from the NEON_REPO_DIR env variable which
+    // must be an absolute path. If the env var is not set, $PWD/.neon is used.
    pub base_data_dir: PathBuf,

    // Path to postgres distribution. It's expected that "bin", "include",
@@ -431,9 +431,7 @@ impl LocalEnv {
    }

    ///  Construct `Self` from on-disk state.
-    pub fn load_config() -> anyhow::Result<Self> {
-        let repopath = base_path();
-
+    pub fn load_config(repopath: &Path) -> anyhow::Result<Self> {
        if !repopath.exists() {
            bail!(
                "Neon config is not found in {}. You need to run 'neon_local init' first",
@@ -461,7 +459,7 @@ impl LocalEnv {
                branch_name_mappings,
            } = on_disk_config;
            LocalEnv {
-                base_data_dir: repopath.clone(),
+                base_data_dir: repopath.to_owned(),
                pg_distrib_dir,
                neon_distrib_dir,
                default_tenant_id,
@@ -482,7 +480,7 @@ impl LocalEnv {
            "we ensure this during deserialization"
        );
        env.pageservers = {
-            let iter = std::fs::read_dir(&repopath).context("open dir")?;
+            let iter = std::fs::read_dir(repopath).context("open dir")?;
            let mut pageservers = Vec::new();
            for res in iter {
                let dentry = res?;
@@ -719,10 +717,25 @@ impl LocalEnv {
 }

 pub fn base_path() -> PathBuf {
-    match std::env::var_os("NEON_REPO_DIR") {
-        Some(val) => PathBuf::from(val),
-        None => PathBuf::from(".neon"),
-    }
+    let path = match std::env::var_os("NEON_REPO_DIR") {
+        Some(val) => {
+            let path = PathBuf::from(val);
+            if !path.is_absolute() {
+                // repeat the env var in the error because our default is always absolute
+                panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
+            }
+            path
+        }
+        None => {
+            let pwd = std::env::current_dir()
+                // technically this can fail but it's quite unlikely
+                .expect("determine current directory");
+            let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
+            pwd_abs.join(".neon")
+        }
+    };
+    assert!(path.is_absolute());
+    path
 }

 /// Generate a public/private key pair for JWT authentication
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -383,6 +383,10 @@ impl PageServerNode {
                .map(|x| x.parse::<AuxFilePolicy>())
                .transpose()
                .context("Failed to parse 'switch_aux_file_policy'")?,
+            lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
+            lsn_lease_length_for_ts: settings
+                .remove("lsn_lease_length_for_ts")
+                .map(|x| x.to_string()),
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -506,6 +510,10 @@ impl PageServerNode {
                    .map(|x| x.parse::<AuxFilePolicy>())
                    .transpose()
                    .context("Failed to parse 'switch_aux_file_policy'")?,
+                lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
+                lsn_lease_length_for_ts: settings
+                    .remove("lsn_lease_length_for_ts")
+                    .map(|x| x.to_string()),
            }
        };

--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -14,6 +14,7 @@ use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
+use utils::auth::{Claims, Scope};
 use utils::{http::error::HttpErrorBody, id::NodeId};

 use crate::{
@@ -197,7 +198,7 @@ impl SafekeeperNode {
            &datadir,
            &self.env.safekeeper_bin(),
            &args,
-            [],
+            self.safekeeper_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
                match self.check_status().await {
@@ -210,6 +211,18 @@ impl SafekeeperNode {
        .await
    }

+    fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
+        // Generate a token to connect from safekeeper to peers
+        if self.conf.auth_enabled {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
+            Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
+        } else {
+            Ok(Vec::new())
+        }
+    }
+
    ///
    /// Stop the server.
    ///
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -46,6 +46,7 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
+    pub generation_override: Option<i32>,
 }

 #[derive(Serialize, Deserialize)]
@@ -313,15 +314,17 @@ impl StorageController {
            args.push(format!("--split-threshold={split_threshold}"))
        }

+        args.push(format!(
+            "--neon-local-repo-dir={}",
+            self.env.base_data_dir.display()
+        ));
+
        background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.storage_controller_bin(),
            args,
-            [(
-                "NEON_REPO_DIR".to_string(),
-                self.env.base_data_dir.to_string_lossy().to_string(),
-            )],
+            [],
            background_process::InitialPidFile::Create(self.pid_file()),
            || async {
                match self.ready().await {
@@ -440,6 +443,7 @@ impl StorageController {
        let request = AttachHookRequest {
            tenant_shard_id,
            node_id: Some(pageserver_id),
+            generation_override: None,
        };

        let response = self
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
+futures.workspace = true
 humantime.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,3 +1,4 @@
+use futures::StreamExt;
 use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
@@ -148,6 +149,22 @@ enum Command {
        #[arg(long)]
        threshold: humantime::Duration,
    },
+    /// Drain a set of specified pageservers by moving the primary attachments to pageservers
+    /// outside of the specified set.
+    Drain {
+        /// Set of pageserver node ids to drain.
+        #[arg(long)]
+        nodes: Vec<NodeId>,
+        /// Optional: migration concurrency (default is 8)
+        #[arg(long)]
+        concurrency: Option<usize>,
+        /// Optional: maximum number of shards to migrate
+        #[arg(long)]
+        max_shards: Option<usize>,
+        /// Optional: when set to true, nothing is migrated, but the plan is printed to stdout
+        #[arg(long)]
+        dry_run: Option<bool>,
+    },
 }

 #[derive(Parser)]
@@ -737,6 +754,194 @@ async fn main() -> anyhow::Result<()> {
                })
                .await?;
        }
+        Command::Drain {
+            nodes,
+            concurrency,
+            max_shards,
+            dry_run,
+        } => {
+            // Load the list of nodes, split them up into the drained and filled sets,
+            // and validate that draining is possible.
+            let node_descs = storcon_client
+                .dispatch::<(), Vec<NodeDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/node".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut node_to_drain_descs = Vec::new();
+            let mut node_to_fill_descs = Vec::new();
+
+            for desc in node_descs {
+                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                if to_drain {
+                    node_to_drain_descs.push(desc);
+                } else {
+                    node_to_fill_descs.push(desc);
+                }
+            }
+
+            if nodes.len() != node_to_drain_descs.len() {
+                anyhow::bail!("Drain requested for node which doesn't exist.")
+            }
+
+            node_to_fill_descs.retain(|desc| {
+                matches!(desc.availability, NodeAvailabilityWrapper::Active)
+                    && matches!(
+                        desc.scheduling,
+                        NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling
+                    )
+            });
+
+            if node_to_fill_descs.is_empty() {
+                anyhow::bail!("There are no nodes to drain to")
+            }
+
+            // Set the node scheduling policy to draining for the nodes which
+            // we plan to drain.
+            for node_desc in node_to_drain_descs.iter() {
+                let req = NodeConfigureRequest {
+                    node_id: node_desc.id,
+                    availability: None,
+                    scheduling: Some(NodeSchedulingPolicy::Draining),
+                };
+
+                storcon_client
+                    .dispatch::<_, ()>(
+                        Method::PUT,
+                        format!("control/v1/node/{}/config", node_desc.id),
+                        Some(req),
+                    )
+                    .await?;
+            }
+
+            // Perform the drain: move each tenant shard scheduled on a node to
+            // be drained to a node which is being filled. A simple round robin
+            // strategy is used to pick the new node.
+            let tenants = storcon_client
+                .dispatch::<(), Vec<TenantDescribeResponse>>(
+                    Method::GET,
+                    "control/v1/tenant".to_string(),
+                    None,
+                )
+                .await?;
+
+            let mut selected_node_idx = 0;
+
+            struct DrainMove {
+                tenant_shard_id: TenantShardId,
+                from: NodeId,
+                to: NodeId,
+            }
+
+            let mut moves: Vec<DrainMove> = Vec::new();
+
+            let shards = tenants
+                .into_iter()
+                .flat_map(|tenant| tenant.shards.into_iter());
+            for shard in shards {
+                if let Some(max_shards) = max_shards {
+                    if moves.len() >= max_shards {
+                        println!(
+                            "Stop planning shard moves since the requested maximum was reached"
+                        );
+                        break;
+                    }
+                }
+
+                let should_migrate = {
+                    if let Some(attached_to) = shard.node_attached {
+                        node_to_drain_descs
+                            .iter()
+                            .map(|desc| desc.id)
+                            .any(|id| id == attached_to)
+                    } else {
+                        false
+                    }
+                };
+
+                if !should_migrate {
+                    continue;
+                }
+
+                moves.push(DrainMove {
+                    tenant_shard_id: shard.tenant_shard_id,
+                    from: shard
+                        .node_attached
+                        .expect("We only migrate attached tenant shards"),
+                    to: node_to_fill_descs[selected_node_idx].id,
+                });
+                selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len();
+            }
+
+            let total_moves = moves.len();
+
+            if dry_run == Some(true) {
+                println!("Dryrun requested. Planned {total_moves} moves:");
+                for mv in &moves {
+                    println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to)
+                }
+
+                return Ok(());
+            }
+
+            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
+            let mut stream = futures::stream::iter(moves)
+                .map(|mv| {
+                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
+                    async move {
+                        client
+                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
+                                Method::PUT,
+                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
+                                Some(TenantShardMigrateRequest {
+                                    tenant_shard_id: mv.tenant_shard_id,
+                                    node_id: mv.to,
+                                }),
+                            )
+                            .await
+                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
+                    }
+                })
+                .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY));
+
+            let mut success = 0;
+            let mut failure = 0;
+
+            while let Some(res) = stream.next().await {
+                match res {
+                    Ok(_) => {
+                        success += 1;
+                    }
+                    Err((tenant_shard_id, from, to, error)) => {
+                        failure += 1;
+                        println!(
+                            "Failed to migrate {} from node {} to node {}: {}",
+                            tenant_shard_id, from, to, error
+                        );
+                    }
+                }
+
+                if (success + failure) % 20 == 0 {
+                    println!(
+                        "Processed {}/{} shards: {} succeeded, {} failed",
+                        success + failure,
+                        total_moves,
+                        success,
+                        failure
+                    );
+                }
+            }
+
+            println!(
+                "Processed {}/{} shards: {} succeeded, {} failed",
+                success + failure,
+                total_moves,
+                success,
+                failure
+            );
+        }
    }

    Ok(())
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -8,6 +8,11 @@ USER root
 RUN apt-get update &&       \
    apt-get install -y curl \
                       jq   \
+                       python3-pip \
                       netcat
+# Faker is required for the pg_anon test
+RUN pip3 install Faker
+# This is required for the pg_hint_plan test
+RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 

-USER postgres
+USER postgres
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
@@ -95,7 +95,7 @@
            },
            {
                "name": "shared_preload_libraries",
-                "value": "neon",
+                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
                "vartype": "string"
            },
            {
@@ -127,6 +127,16 @@
                "name": "max_replication_flush_lag",
                "value": "10GB",
                "vartype": "string"
+            },
+            {
+                "name": "cron.database",
+                "value": "postgres",
+                "vartype": "string"
+            },
+            {
+                "name": "session_preload_libraries",
+                "value": "anon",
+                "vartype": "string"
            }
        ]
    },
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3'
-
 services:
  minio:
    restart: always
@@ -161,12 +159,12 @@ services:
      context: ./compute_wrapper/
      args:
        - REPOSITORY=${REPOSITORY:-neondatabase}
-        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
+        - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16}
        - TAG=${TAG:-latest}
        - http_proxy=$http_proxy
        - https_proxy=$https_proxy
    environment:
-      - PG_VERSION=${PG_VERSION:-14}
+      - PG_VERSION=${PG_VERSION:-16}
      #- RUST_BACKTRACE=1
    # Mount the test files directly, for faster editing cycle.
    volumes:
@@ -194,3 +192,14 @@ services:
         done"
    depends_on:
      - compute
+
+  # Container with the extension sources under /ext-src and /run-tests.sh.
+  # It only starts with the "test-extensions" profile and idles for 30 minutes.
+  neon-test-extensions:
+    profiles:
+      - test-extensions
+    image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest}
+    entrypoint: ["/bin/bash", "-c"]
+    command: ["sleep 1800"]
+    depends_on:
+      - compute
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -7,52 +7,94 @@
 # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
 # Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
 # to verify custom image builds (e.g pre-published ones).
-
+#
+# A test script for postgres extensions
+# Currently supports only v16
+#
 set -eux -o pipefail

-SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
-
+COMPOSE_FILE='docker-compose.yml'
+cd "$(dirname "$0")"  # COMPOSE_FILE is relative, so run from the script's own directory
 COMPUTE_CONTAINER_NAME=docker-compose-compute-1
-SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
-PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
+TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
+PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
+: "${http_proxy:=}"  # default the proxy variables to empty when they are unset ...
+: "${https_proxy:=}"
+export http_proxy https_proxy  # ... and export them for docker compose

 cleanup() {
    echo "show container information"
    docker ps
-    docker compose -f $COMPOSE_FILE logs
+    docker compose --profile test-extensions -f $COMPOSE_FILE logs
    echo "stop containers..."
-    docker compose -f $COMPOSE_FILE down
+    docker compose --profile test-extensions -f $COMPOSE_FILE down
 }

-echo "clean up containers if exists"
-cleanup
-
 for pg_version in 14 15 16; do
-    echo "start containers (pg_version=$pg_version)."
-    PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
+    echo "clean up containers if exists"
+    cleanup
+    PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
+    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
-    while sleep 1; do
+    while sleep 3; do
        # check timeout
-        cnt=`expr $cnt + 1`
+        cnt=`expr $cnt + 3`
        if [ $cnt -gt 60 ]; then
            echo "timeout before the compute is ready."
            cleanup
            exit 1
        fi
-
-        # check if the compute is ready
-        set +o pipefail
-        result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
-        set -o pipefail
-        if [ $result -eq 1 ]; then
+        if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
-            cleanup
            break
        fi
    done
+
+    if [ $pg_version -ge 16 ]
+    then
+        echo Enabling trust connection
+        docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
+        echo Adding postgres role
+        docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN"
+        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
+        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
+        echo Adding dummy config
+        docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+        # This block is required for the pg_anon extension test.
+        # The test assumes that it is running on the same host with the postgres engine.
+        # In our case it's not true, that's why we are copying files to the compute node
+        TMPDIR=$(mktemp -d)
+        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
+        echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
+        rm -rf $TMPDIR
+        TMPDIR=$(mktemp -d)
+        # The following block does the same for the pg_hintplan test
+        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
+        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
+        rm -rf $TMPDIR
+        # We are running tests now
+        if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
+            $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
+        then
+            cleanup
+        else
+            FAILED=$(tail -1 testout.txt)
+            for d in $FAILED
+            do
+                mkdir $d
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true
+                docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true
+                cat $d/regression.out $d/regression.diffs || true
+            done
+        rm -rf $FAILED
+        cleanup
+        exit 1
+        fi
+    fi
+    cleanup
 done
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Run "make installcheck" in every /ext-src/*-src directory that is not listed
+# in the comma-separated SKIP environment variable.
+# On failure, the last line printed to stdout is the space-separated list of
+# failed directories and the exit status is 1.
+set -x
+
+cd /ext-src || exit 1
+FAILED=
+# Names appearing in both SKIP and the directory listing show up twice and are
+# dropped by "uniq -u"; SKIP entries without a directory are filtered below.
+LIST=$( (echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
+for d in ${LIST}
+do
+    [ -d "${d}" ] || continue
+    # Stop as soon as the database is unreachable, and report the current
+    # directory as failed so that an early abort is never reported as success.
+    psql -c "select 1" >/dev/null || { FAILED="${d} ${FAILED}"; break; }
+    make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+done
+[ -z "${FAILED}" ] && exit 0
+echo ${FAILED}
+exit 1
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -11,15 +11,28 @@ page server. We currently use the same binary for both, with --wal-redo runtime
 the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
 the WAL redo process.

-In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
-smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
-way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.
+In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that
+hooks into the smgr interface, and an rmgr extension in pgxn/neon_rmgr. The extensions are loaded into
+the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the
+extensions, and for any new features, that is preferred over modifying core PostgreSQL code.

 Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
 compute, and changes needed for the WAL redo process:

 # Changes for Compute node

+## Prefetching
+
+There are changes in many places to perform prefetching, for example for sequential scans. Neon
+doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local
+disk, so prefetching is critical for performance, also for sequential scans.
+
+### How to get rid of the patch
+
+Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully
+do more.
+
+
 ## Add t_cid to heap WAL records

 ```
@@ -37,54 +50,11 @@ The problem is that the XLOG_HEAP_INSERT record does not include the command id

 Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.

+Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded.

 ### Alternatives
 Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated.

-## ginfast.c
-
-```
-diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
-index e0d9940946..2d964c02e9 100644
--- a/src/backend/access/gin/ginfast.c
-+++ b/src/backend/access/gin/ginfast.c
-@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
-                memset(&sublist, 0, sizeof(GinMetaPageData));
-                makeSublist(index, collector->tuples, collector->ntuples, &sublist);
- 
-+               if (metadata->head != InvalidBlockNumber)
-+               {
-+                       /*
-+                        * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
-+                        * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
-+                        * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
-+                        * will try to WAL-log an image of the page.
-+                        */
-+                       buffer = ReadBuffer(index, metadata->tail);
-+               }
-+
-                if (needWal)
-                        XLogBeginInsert();
- 
-@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
-                        data.prevTail = metadata->tail;
-                        data.newRightlink = sublist.head;
- 
-                       buffer = ReadBuffer(index, metadata->tail);
-                        LockBuffer(buffer, GIN_EXCLUSIVE);
-                        page = BufferGetPage(buffer);
-```
-
-The problem is explained in the comment above
-
-### How to get rid of the patch
-
-Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
-section or something.
-
-Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?
-
-
 ## Mark index builds that use buffer manager without logging explicitly

 ```
@@ -95,6 +65,8 @@ Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and
 also some changes in src/backend/storage/smgr/smgr.c
 ```

+pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too.
+
 When a GIN index is built, for example, it is built by inserting the entries into the index more or
 less normally, but without WAL-logging anything. After the index has been built, we iterate through
 all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
@@ -109,6 +81,10 @@ an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1`
 I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
 changes to a patch and post to pgsql-hackers.

+Perhaps we could deduce that an unlogged index build has started when we see a page being evicted
+with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a
+check for that and PANIC if we see a page with zero LSN being evicted. And how do we detect when the
+index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that.

 ## Track last-written page LSN

@@ -140,57 +116,6 @@ The old method is still available, though.
 Wait until v15?


-## Cache relation sizes
-
-The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
-to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
-relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
-Neon)
-
-
-## Use buffer manager when extending VM or FSM
-
-```
- src/backend/storage/freespace/freespace.c                   |   14 +-
- src/backend/access/heap/visibilitymap.c                     |   15 +-
-
-diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
-index e198df65d8..addfe93eac 100644
--- a/src/backend/access/heap/visibilitymap.c
-+++ b/src/backend/access/heap/visibilitymap.c
-@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
-        /* Now extend the file */
-        while (vm_nblocks_now < vm_nblocks)
-        {
-               PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
-+               /*
-+                * ZENITH: Initialize VM pages through buffer cache to prevent loading
-+                * them from pageserver.
-+                */
-+               Buffer  buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
-+                                                                                       RBM_ZERO_AND_LOCK, NULL);
-+               Page    page = BufferGetPage(buffer);
-+
-+               PageInit((Page) page, BLCKSZ, 0);
-+               PageSetChecksumInplace(page, vm_nblocks_now);
-+               MarkBufferDirty(buffer);
-+               UnlockReleaseBuffer(buffer);
- 
-               smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
-                                  pg.data, false);
-                vm_nblocks_now++;
-        }
-```
-
-### Problem we're trying to solve
-
-???
-
-### How to get rid of the patch
-
-Maybe this would be a reasonable change in PostgreSQL too?
-
-
 ## Allow startup without reading checkpoint record

 In Neon, the compute node is stateless. So when we are launching compute node, we need to provide
@@ -231,7 +156,7 @@ index 0415df9ccb..9f9db3c8bc 100644
  * crash we can lose (skip over) as many values as we pre-logged.
  */
 -#define SEQ_LOG_VALS   32
-+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
+/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
 +/* #define SEQ_LOG_VALS        32 */
 +#define SEQ_LOG_VALS   0
 ```
@@ -250,66 +175,6 @@ would be weird if the sequence moved backwards though, think of PITR.
 Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon.


-## Walproposer
-
-```
- src/Makefile                                                |    1 +
- src/backend/replication/libpqwalproposer/Makefile           |   37 +
- src/backend/replication/libpqwalproposer/libpqwalproposer.c |  416 ++++++++++++
- src/backend/postmaster/bgworker.c                           |    4 +
- src/backend/postmaster/postmaster.c                         |    6 +
- src/backend/replication/Makefile                            |    4 +-
- src/backend/replication/walproposer.c                       | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- src/backend/replication/walproposer_utils.c                 |  402 +++++++++++
- src/backend/replication/walreceiver.c                       |    7 +
- src/backend/replication/walsender.c                         |  320 ++++++---
- src/backend/storage/ipc/ipci.c                              |    6 +
- src/include/replication/walproposer.h                       |  565 ++++++++++++++++
-```
-
-WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.  It is
-currently implemented as patch to standard WAL sender.
-
-### How to get rid of the patch
-
-Refactor into an extension. Submit hooks or APIs into upstream if necessary.
-
-@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96
-
-## Ignore unexpected data beyond EOF in bufmgr.c
-
-```
-@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
-                 */
-                bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
-                if (!PageIsNew((Page) bufBlock))
-                       ereport(ERROR,
-+               {
-+                        // XXX-ZENITH
-+                        MemSet((char *) bufBlock, 0, BLCKSZ);
-+                        ereport(DEBUG1,
-                                        (errmsg("unexpected data beyond EOF in block %u of relation %s",
-                                                        blockNum, relpath(smgr->smgr_rnode, forkNum)),
-                                         errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
-
-+               }
-                /*
-                 * We *must* do smgrextend before succeeding, else the page will not
-                 * be reserved by the kernel, and the next P_NEW call will decide to
-```
-
-PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
-first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend
-a relation at the same time, the pages can be WAL-logged in different order.
-
-I'm not sure what scenario exactly required this change in Neon, though.
-
-### How to get rid of the patch
-
-Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
-confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
-and finally WAL-log that the extension succeeded.
-
 ## Make smgr interface available to extensions

 ```
@@ -321,6 +186,8 @@ and finally WAL-log that the extension succeeded.

 Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.

+We have submitted this to upstream, but it's moving at a glacial speed.
+https://commitfest.postgresql.org/47/4428/

 ## Added relpersistence argument to smgropen()

@@ -444,6 +311,148 @@ Ignore it. This is only needed for disaster recovery, so once we've eliminated a
 patches, we can just keep it around as a patch or as separate branch in a repo.


+## pg_waldump flags to ignore errors
+
+After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment. pg_waldump chokes on that, so we added some flags to make it possible to ignore errors.
+
+### How to get rid of the patch
+
+Like the previous one, ignore it.
+
+
+
+## Backpressure if pageserver doesn't ingest WAL fast enough
+
+```
+@@ -3200,6 +3202,7 @@ ProcessInterrupts(void)
+                return;
+        InterruptPending = false;
+ 
+retry:
+        if (ProcDiePending)
+        {
+                ProcDiePending = false;
+@@ -3447,6 +3450,13 @@ ProcessInterrupts(void)
+ 
+        if (ParallelApplyMessagePending)
+                HandleParallelApplyMessages();
+
+       /* Call registered callback if any */
+       if (ProcessInterruptsCallback)
+       {
+               if (ProcessInterruptsCallback())
+                       goto retry;
+       }
+ }
+```
+
+
+### How to get rid of the patch
+
+Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions
+too.
+
+
+## SLRU on-demand download
+
+```
+ src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 92 insertions(+), 13 deletions(-)
+```
+
+### Problem we're trying to solve
+
+Previously, SLRU files were included in the basebackup, but the total size of them can be large,
+several GB, and downloading them all made the startup time too long.
+
+### Alternatives
+
+FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files
+
+
+## WAL-log an all-zeros page as one large hole
+
+- In XLogRecordAssemble()
+
+### Problem we're trying to solve
+
+This change was made in v16. Starting with v16, when PostgreSQL extends a relation, it first extends
+it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-logged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL-log a compact record with a whole-page "hole". However, PostgreSQL has assertions that prevent such WAL records from being replayed, so this breaks compatibility such that unmodified PostgreSQL cannot process Neon-generated WAL.
+
+### How to get rid of the patch
+
+Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps.
+
+
+## Shut down walproposer after checkpointer
+
+```
+                       /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */
+                       if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) &&
+```
+
+This change was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers.
+
+### How to get rid of the patch
+
+Do a bigger refactoring of the postmaster state machine, such that a background worker can specify
+the shutdown ordering by itself. The postmaster state machine has grown pretty complicated, and
+would benefit from a refactoring for the sake of readability anyway.
+
+
+## EXPLAIN changes for prefetch and LFC
+
+### How to get rid of the patch
+
+Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state.
+
+
+## On-demand download of extensions
+
+### How to get rid of the patch
+
+FUSE or LD_PRELOAD trickery to intercept reads?
+
+
+## Publication superuser checks
+
+We have hacked CreatePublication so that neon_superuser can also create them.
+
+### How to get rid of the patch
+
+Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users.
+
+
+## WAL log replication slots
+
+### How to get rid of the patch
+
+Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else.
+
+
+## WAL-log replication snapshots
+
+### How to get rid of the patch
+
+WAL-log them periodically, from a background worker.
+
+
+## WAL-log relmapper files
+
+Similarly to replication snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged.
+
+### How to get rid of the patch
+
+WAL-log them periodically, from a background worker.
+
+
+## XLogWaitForReplayOf()
+
+??
+
+
+
+
 # Not currently committed but proposed

 ## Disable ring buffer buffer manager strategies
@@ -472,23 +481,10 @@ hint bits are set. Wal logging hint bits updates requires FPI which significantl

 Add special WAL record for setting page hints.

-## Prefetching
-
-### Why?
-
-As far as pages in Neon are loaded on demand, to reduce node startup time
-and also speedup some massive queries we need some mechanism for bulk loading to
-reduce page request round-trip overhead.
-
-Currently Postgres is supporting prefetching only for bitmap scan.
-In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us.
-For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages
-of heap relation addressed by TIDs.
-
 ## Prewarming

 ### Why?

-Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
+Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Neon.
 But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
 We can capture state of compute node buffer cache and send bulk request for this pages at startup.
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -4,18 +4,18 @@

 Currently we build two main images:

- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
+- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
+- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.

 And additional intermediate image:

 - [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.

-## Building pipeline
+## Build pipeline

 We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs

-1. `neondatabase/compute-tools` and `neondatabase/compute-node`
+1. `neondatabase/compute-tools` and `neondatabase/compute-node-v16` (and -v15 and -v14)

 2. `neondatabase/neon`

@@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea
 1. create containers

 You can specify version of neon cluster using following environment values.
- PG_VERSION: postgres version for compute (default is 14)
- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml)
+- PG_VERSION: postgres version for compute (default is 16 as of this writing)
+- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags), which is tagged in [CI test](/.github/workflows/build_and_test.yml). Default is 'latest'
 ```
 $ cd docker-compose/
 $ docker-compose down   # remove the containers if exists
-$ PG_VERSION=15 TAG=2937 docker-compose up --build -d  # You can specify the postgres and image version
+$ PG_VERSION=16 TAG=latest docker-compose up --build -d  # You can specify the postgres and image version
 Creating network "dockercompose_default" with the default driver
 Creating docker-compose_storage_broker_1       ... done
 (...omit...)
@@ -47,29 +47,31 @@ Creating docker-compose_storage_broker_1       ... done

 2. connect compute node
 ```
-$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ chmod 600 ~/.pgpass
-$ psql -h localhost -p 55433 -U cloud_admin
+$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres
+psql (16.3)
+Type "help" for help.
+
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
-postgres=# insert into t values(1,1);
+postgres=# insert into t values(1, 1);
 INSERT 0 1
 postgres=# select * from t;
- key | value
+ key | value 
 -----+-------
   1 | 1
 (1 row)
+
 ```

 3. If you want to see the log, you can use `docker-compose logs` command.
 ```
 # check the container name you want to see
 $ docker ps
-CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                                                                  NAMES
-d6968a5ae912   dockercompose_compute                              "/shell/compute.sh"      5 minutes ago   Up 5 minutes   0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp                                                                                       dockercompose_compute_1
+CONTAINER ID   IMAGE                                              COMMAND                  CREATED         STATUS         PORTS                                                                                      NAMES
+3582f6d76227   docker-compose_compute                             "/shell/compute.sh"      2 minutes ago   Up 2 minutes   0.0.0.0:3080->3080/tcp, :::3080->3080/tcp, 0.0.0.0:55433->55433/tcp, :::55433->55433/tcp   docker-compose_compute_1
 (...omit...)

-$ docker logs -f dockercompose_compute_1
+$ docker logs -f docker-compose_compute_1
 2022-10-21 06:15:48.757 GMT [56] LOG:  connection authorized: user=cloud_admin database=postgres application_name=psql
 2022-10-21 06:17:00.307 GMT [56] LOG:  [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400'
 (...omit...)
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -101,11 +101,12 @@ or
 ```toml
 [remote_storage]
 container_name = 'some-container-name'
+storage_account = 'somestorageaccnt'
 container_region = 'us-east'
 prefix_in_container = '/test-prefix/'
 ```

-`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed.
+The `AZURE_STORAGE_ACCESS_KEY` env variable can be used to specify the azure credentials if needed.

 ## Repository background tasks

--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -209,6 +209,7 @@ pub enum NodeSchedulingPolicy {
    Active,
    Filling,
    Pause,
+    PauseForRestart,
    Draining,
 }

@@ -220,6 +221,7 @@ impl FromStr for NodeSchedulingPolicy {
            "active" => Ok(Self::Active),
            "filling" => Ok(Self::Filling),
            "pause" => Ok(Self::Pause),
+            "pause_for_restart" => Ok(Self::PauseForRestart),
            "draining" => Ok(Self::Draining),
            _ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
        }
@@ -233,6 +235,7 @@ impl From<NodeSchedulingPolicy> for String {
            Active => "active",
            Filling => "filling",
            Pause => "pause",
+            PauseForRestart => "pause_for_restart",
            Draining => "draining",
        }
        .to_string()
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -558,6 +558,12 @@ impl KeySpaceRandomAccum {
        self.ranges.push(range);
    }

+    pub fn add_keyspace(&mut self, keyspace: KeySpace) {
+        for range in keyspace.ranges {
+            self.add_range(range);
+        }
+    }
+
    pub fn to_keyspace(mut self) -> KeySpace {
        let mut ranges = Vec::new();
        if !self.ranges.is_empty() {
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -177,6 +177,20 @@ serde_with::serde_conv!(
    |value: String| -> Result<_, humantime::TimestampError> { humantime::parse_rfc3339(&value) }
 );

+impl LsnLease {
+    /// The default length for an explicit LSN lease request (10 minutes).
+    pub const DEFAULT_LENGTH: Duration = Duration::from_secs(10 * 60);
+
+    /// The default length for an implicit LSN lease granted during
+    /// `get_lsn_by_timestamp` request (1 minute).
+    pub const DEFAULT_LENGTH_FOR_TS: Duration = Duration::from_secs(60);
+
+    /// Checks whether the lease is expired.
+    pub fn is_expired(&self, now: &SystemTime) -> bool {
+        now > &self.valid_until
+    }
+}
+
 /// The only [`TenantState`] variants we could be `TenantState::Activating` from.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum ActivatingFrom {
@@ -322,6 +336,8 @@ pub struct TenantConfig {
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
+    pub lsn_lease_length: Option<String>,
+    pub lsn_lease_length_for_ts: Option<String>,
 }

 /// The policy for the aux file storage. It can be switched through `switch_aux_file_policy`
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -54,7 +54,10 @@ impl AzureBlobStorage {
            azure_config.container_name
        );

-        let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
+        // Use the storage account from the config by default, fall back to env var if not present.
+        let account = azure_config.storage_account.clone().unwrap_or_else(|| {
+            env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT")
+        });

        // If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
        // otherwise try the token based credentials.
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -466,7 +466,11 @@ impl GenericRemoteStorage {
                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
            }
            RemoteStorageKind::AzureContainer(azure_config) => {
-                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
+                let storage_account = azure_config
+                    .storage_account
+                    .as_deref()
+                    .unwrap_or("<AZURE_STORAGE_ACCOUNT>");
+                info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'",
                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
            }
@@ -589,6 +593,8 @@ impl Debug for S3Config {
 pub struct AzureConfig {
    /// Name of the container to connect to.
    pub container_name: String,
+    /// Name of the storage account the container belongs to; if `None`, the `AZURE_STORAGE_ACCOUNT` env var is used.
+    pub storage_account: Option<String>,
    /// The region where the bucket is located at.
    pub container_region: String,
    /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
@@ -603,8 +609,9 @@ impl Debug for AzureConfig {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("AzureConfig")
            .field("bucket_name", &self.container_name)
+            .field("storage_account", &self.storage_account)
            .field("bucket_region", &self.container_region)
-            .field("prefix_in_bucket", &self.prefix_in_container)
+            .field("prefix_in_container", &self.prefix_in_container)
            .field("concurrency_limit", &self.concurrency_limit)
            .field(
                "max_keys_per_list_response",
@@ -718,6 +725,12 @@ impl RemoteStorageConfig {
            (None, None, None, Some(container_name), Some(container_region)) => {
                RemoteStorageKind::AzureContainer(AzureConfig {
                    container_name: parse_toml_string("container_name", container_name)?,
+                    storage_account: toml
+                        .get("storage_account")
+                        .map(|storage_account| {
+                            parse_toml_string("storage_account", storage_account)
+                        })
+                        .transpose()?,
                    container_region: parse_toml_string("container_region", container_region)?,
                    prefix_in_container: toml
                        .get("prefix_in_container")
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -212,6 +212,7 @@ fn create_azure_client(
    let remote_storage_config = RemoteStorageConfig {
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
+            storage_account: None,
            container_region: remote_storage_azure_region,
            prefix_in_container: Some(format!("test_{millis}_{random:08x}/")),
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -7,7 +7,7 @@ license.workspace = true
 [dependencies]
 hyper.workspace = true
 opentelemetry = { workspace = true, features=["rt-tokio"] }
-opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions.workspace = true
 reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -25,6 +25,8 @@ pub struct Config {
    ///
    /// For simplicity, this value must be greater than or equal to `memory_history_len`.
    memory_history_log_interval: usize,
+    /// The max duration for which logging may be skipped while memory usage stays similar
+    memory_history_log_noskip_interval: Duration,
 }

 impl Default for Config {
@@ -33,6 +35,7 @@ impl Default for Config {
            memory_poll_interval: Duration::from_millis(100),
            memory_history_len: 5, // use 500ms of history for decision-making
            memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
+            memory_history_log_noskip_interval: Duration::from_secs(15), // but only if it's changed, or 15 seconds have passed
        }
    }
 }
@@ -85,7 +88,12 @@ impl CgroupWatcher {

        // buffer for samples that will be logged. once full, it remains so.
        let history_log_len = self.config.memory_history_log_interval;
+        let max_skip = self.config.memory_history_log_noskip_interval;
        let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
+        let mut last_logged_memusage = MemoryStatus::zeroed();
+
+        // Ensure that we're tracking a value that's definitely in the past, as Instant::now is only guaranteed to be non-decreasing on Rust's T1-supported systems.
+        let mut can_skip_logs_until = Instant::now() - max_skip;

        for t in 0_u64.. {
            ticker.tick().await;
@@ -115,12 +123,24 @@ impl CgroupWatcher {
            // equal to the logging interval, we can just log the entire buffer every time we set
            // the last entry, which also means that for this log line, we can ignore that it's a
            // ring buffer (because all the entries are in order of increasing time).
-            if i == history_log_len - 1 {
+            //
+            // We skip logging the data if data hasn't meaningfully changed in a while, unless
+            // we've already ignored previous iterations for the last max_skip period.
+            if i == history_log_len - 1
+                && (now > can_skip_logs_until
+                    || !history_log_buf
+                        .iter()
+                        .all(|usage| last_logged_memusage.status_is_close_or_similar(usage)))
+            {
                info!(
                    history = ?MemoryStatus::debug_slice(&history_log_buf),
                    summary = ?summary,
                    "Recent cgroup memory statistics history"
                );
+
+                can_skip_logs_until = now + max_skip;
+
+                last_logged_memusage = *history_log_buf.last().unwrap();
            }

            updates
@@ -232,6 +252,24 @@ impl MemoryStatus {

        DS(slice)
    }
+
+    /// Check if the other memory status is a close or similar result.
+    /// Returns true if the absolute difference in `non_reclaimable` is strictly less
+    /// than 1/8 of the smaller value (rounded down), and strictly less than 128MiB.
+    /// See tests::check_similarity_behaviour for examples of behaviour.
+    fn status_is_close_or_similar(&self, other: &MemoryStatus) -> bool {
+        let margin;
+        let diff;
+        if self.non_reclaimable >= other.non_reclaimable {
+            margin = other.non_reclaimable / 8;
+            diff = self.non_reclaimable - other.non_reclaimable;
+        } else {
+            margin = self.non_reclaimable / 8;
+            diff = other.non_reclaimable - self.non_reclaimable;
+        }
+
+        diff < margin && diff < 128 * 1024 * 1024
+    }
 }

 #[cfg(test)]
@@ -261,4 +299,65 @@ mod tests {
        assert_eq!(values(2, 4), [9, 0, 1, 2]);
        assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
    }
+
+    #[test]
+    fn check_similarity_behaviour() {
+        // This all accesses private methods, so we can't actually run this
+        // as doctests, because doctests run as an external crate.
+        let mut small = super::MemoryStatus {
+            non_reclaimable: 1024,
+        };
+        let mut large = super::MemoryStatus {
+            non_reclaimable: 1024 * 1024 * 1024 * 1024,
+        };
+
+        // objects are self-similar, no matter the size
+        assert!(small.status_is_close_or_similar(&small));
+        assert!(large.status_is_close_or_similar(&large));
+
+        // inequality is symmetric
+        assert!(!small.status_is_close_or_similar(&large));
+        assert!(!large.status_is_close_or_similar(&small));
+
+        small.non_reclaimable = 64;
+        large.non_reclaimable = (small.non_reclaimable / 8) * 9;
+
+        // objects are self-similar, no matter the size
+        assert!(small.status_is_close_or_similar(&small));
+        assert!(large.status_is_close_or_similar(&large));
+
+        // values are similar if the larger value is larger by less than
+        // 12.5%, i.e. 1/8 of the smaller value.
+        // In the example above, large is exactly 12.5% larger, so this doesn't
+        // match.
+        assert!(!small.status_is_close_or_similar(&large));
+        assert!(!large.status_is_close_or_similar(&small));
+
+        large.non_reclaimable -= 1;
+        assert!(large.status_is_close_or_similar(&large));
+
+        assert!(small.status_is_close_or_similar(&large));
+        assert!(large.status_is_close_or_similar(&small));
+
+        // The 1/8 rule only applies up to 128MiB of difference
+        small.non_reclaimable = 1024 * 1024 * 1024 * 1024;
+        large.non_reclaimable = small.non_reclaimable / 8 * 9;
+        assert!(small.status_is_close_or_similar(&small));
+        assert!(large.status_is_close_or_similar(&large));
+
+        assert!(!small.status_is_close_or_similar(&large));
+        assert!(!large.status_is_close_or_similar(&small));
+        // the large value is put just above the threshold
+        large.non_reclaimable = small.non_reclaimable + 128 * 1024 * 1024;
+        assert!(large.status_is_close_or_similar(&large));
+
+        assert!(!small.status_is_close_or_similar(&large));
+        assert!(!large.status_is_close_or_similar(&small));
+        // now below
+        large.non_reclaimable -= 1;
+        assert!(large.status_is_close_or_similar(&large));
+
+        assert!(small.status_is_close_or_similar(&large));
+        assert!(large.status_is_close_or_similar(&small));
+    }
 }
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -12,11 +12,11 @@ use futures::{
    stream::{SplitSink, SplitStream},
    SinkExt, StreamExt,
 };
-use tracing::info;
+use tracing::{debug, info};

 use crate::protocol::{
-    OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
-    PROTOCOL_MIN_VERSION,
+    OutboundMsg, OutboundMsgKind, ProtocolRange, ProtocolResponse, ProtocolVersion,
+    PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION,
 };

 /// The central handler for all communications in the monitor.
@@ -118,7 +118,12 @@ impl Dispatcher {
    /// serialize the wrong thing and send it, since `self.sink.send` will take
    /// any string.
    pub async fn send(&mut self, message: OutboundMsg) -> anyhow::Result<()> {
-        info!(?message, "sending message");
+        if matches!(&message.inner, OutboundMsgKind::HealthCheck { .. }) {
+            debug!(?message, "sending message");
+        } else {
+            info!(?message, "sending message");
+        }
+
        let json = serde_json::to_string(&message).context("failed to serialize message")?;
        self.sink
            .send(Message::Text(json))
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -12,7 +12,7 @@ use axum::extract::ws::{Message, WebSocket};
 use futures::StreamExt;
 use tokio::sync::{broadcast, watch};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};

 use crate::cgroup::{self, CgroupWatcher};
 use crate::dispatcher::Dispatcher;
@@ -474,26 +474,29 @@ impl Runner {
                // there is a message from the agent
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
-                        // Don't use 'message' as a key as the string also uses
-                        // that for its key
-                        info!(?msg, "received message");
-                        match msg {
+                        match &msg {
                            Ok(msg) => {
                                let message: InboundMsg = match msg {
                                    Message::Text(text) => {
-                                        serde_json::from_str(&text).context("failed to deserialize text message")?
+                                        serde_json::from_str(text).context("failed to deserialize text message")?
                                    }
                                    other => {
                                        warn!(
                                            // Don't use 'message' as a key as the
                                            // string also uses that for its key
                                            msg = ?other,
-                                            "agent should only send text messages but received different type"
+                                            "problem processing incoming message: agent should only send text messages but received different type"
                                        );
                                        continue
                                    },
                                };

+                                if matches!(&message.inner, InboundMsgKind::HealthCheck { .. }) {
+                                    debug!(?msg, "received message");
+                                } else {
+                                    info!(?msg, "received message");
+                                }
+
                                let out = match self.process_message(message.clone()).await {
                                    Ok(Some(out)) => out,
                                    Ok(None) => continue,
@@ -517,7 +520,11 @@ impl Runner {
                                    .await
                                    .context("failed to send message")?;
                            }
-                            Err(e) => warn!("{e}"),
+                            Err(e) => warn!(
+                                error = format!("{e}"),
+                                msg = ?msg,
+                                "received error message"
+                            ),
                        }
                    } else {
                        anyhow::bail!("dispatcher connection closed")
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,11 +1,6 @@
-use std::collections::HashMap;
-
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
-use pageserver::tenant::storage_layer::LayerName;
-use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
-use utils::lsn::Lsn;
+use pageserver::tenant::IndexPart;

 #[derive(clap::Subcommand)]
 pub(crate) enum IndexPartCmd {
@@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
        IndexPartCmd::Dump { path } => {
            let bytes = tokio::fs::read(path).await.context("read file")?;
            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
-            #[derive(serde::Serialize)]
-            struct Output<'a> {
-                layer_metadata: &'a HashMap<LayerName, LayerFileMetadata>,
-                disk_consistent_lsn: Lsn,
-                timeline_metadata: &'a TimelineMetadata,
-            }
-
-            let output = Output {
-                layer_metadata: &des.layer_metadata,
-                disk_consistent_lsn: des.metadata.disk_consistent_lsn(),
-                timeline_metadata: &des.metadata,
-            };
-
-            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            let output = serde_json::to_string_pretty(&des).context("serialize output")?;
            println!("{output}");
            Ok(())
        }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -2,10 +2,9 @@
 //! and push them to a HTTP endpoint.
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{
-    mgr::TenantManager, LogicalSizeCalculationCause, PageReconstructError, Tenant,
-};
+use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant};
 use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
@@ -350,19 +349,12 @@ async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &Re
    // Same for the loop that fetches computed metrics.
    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
    // which turns out is really handy to understand the system.
-    let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
-        return;
-    };
-
-    // this error can be returned if timeline is shutting down, but it does not
-    // mean the synthetic size worker should terminate.
-    let shutting_down = matches!(
-        e.downcast_ref::<PageReconstructError>(),
-        Some(PageReconstructError::Cancelled)
-    );
-
-    if !shutting_down {
-        let tenant_shard_id = tenant.tenant_shard_id();
-        error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+    match tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await {
+        Ok(_) => {}
+        Err(CalculateSyntheticSizeError::Cancelled) => {}
+        Err(e) => {
+            let tenant_shard_id = tenant.tenant_shard_id();
+            error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+        }
    }
 }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1135,7 +1135,10 @@ async fn tenant_size_handler(
            &ctx,
        )
        .await
-        .map_err(ApiError::InternalServerError)?;
+        .map_err(|e| match e {
+            crate::tenant::size::CalculateSyntheticSizeError::Cancelled => ApiError::ShuttingDown,
+            other => ApiError::InternalServerError(anyhow::anyhow!(other)),
+        })?;

    let mut sizes = None;
    let accepts_html = headers
@@ -1143,9 +1146,7 @@ async fn tenant_size_handler(
        .map(|v| v == "text/html")
        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
-        let storage_model = inputs
-            .calculate_model()
-            .map_err(ApiError::InternalServerError)?;
+        let storage_model = inputs.calculate_model();
        let size = storage_model.calculate();

        // If request header expects html, return html
@@ -1729,7 +1730,7 @@ async fn lsn_lease_handler(
        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
            .await?;
    let result = timeline
-        .make_lsn_lease(lsn, &ctx)
+        .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
        .map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;

    json_response(StatusCode::OK, result)
@@ -2429,6 +2430,25 @@ async fn list_aux_files(
    json_response(StatusCode::OK, files)
 }

+async fn perf_info(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let result = timeline.perf_info().await;
+
+    json_response(StatusCode::OK, result)
+}
+
 async fn ingest_aux_files(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -2856,5 +2876,9 @@ pub fn make_router(
            |r| testing_api_handler("list_aux_files", r, list_aux_files),
        )
        .post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
+            |r| testing_api_handler("perf_info", r, perf_info),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -935,7 +935,7 @@ impl PageServerHandler {
        let timeline = self
            .get_active_tenant_timeline(tenant_shard_id.tenant_id, timeline_id, shard_selector)
            .await?;
-        let lease = timeline.make_lsn_lease(lsn, ctx)?;
+        let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
        let valid_until = lease
            .valid_until
            .duration_since(SystemTime::UNIX_EPOCH)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -919,6 +919,14 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

+        #[cfg(test)]
+        {
+            let guard = self.extra_test_dense_keyspace.load();
+            for kr in &guard.ranges {
+                result.add_range(kr.clone());
+            }
+        }
+
        Ok((
            result.to_keyspace(),
            /* AUX sparse key space */
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -240,6 +240,7 @@ pub struct GcResult {
    pub layers_needed_by_cutoff: u64,
    pub layers_needed_by_pitr: u64,
    pub layers_needed_by_branches: u64,
+    pub layers_needed_by_leases: u64,
    pub layers_not_updated: u64,
    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.

@@ -269,6 +270,7 @@ impl AddAssign for GcResult {
        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
        self.layers_needed_by_branches += other.layers_needed_by_branches;
+        self.layers_needed_by_leases += other.layers_needed_by_leases;
        self.layers_not_updated += other.layers_not_updated;
        self.layers_removed += other.layers_removed;

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,6 +31,7 @@ use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::fmt;
+use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -65,9 +66,9 @@ use self::timeline::uninit::TimelineCreateGuard;
 use self::timeline::uninit::TimelineExclusionError;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
+use self::timeline::GcCutoffs;
 use self::timeline::TimelineResources;
 use self::timeline::WaitLsnError;
-use self::timeline::{GcCutoffs, GcInfo};
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -509,11 +510,24 @@ pub(crate) enum GcError {
    #[error(transparent)]
    Remote(anyhow::Error),

+    // A read error encountered while calculating GC cutoffs
+    #[error(transparent)]
+    GcCutoffs(PageReconstructError),
+
    // If GC was invoked for a particular timeline, this error means it didn't exist
    #[error("timeline not found")]
    TimelineNotFound,
 }

+impl From<PageReconstructError> for GcError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => Self::TimelineCancelled,
+            other => Self::GcCutoffs(other),
+        }
+    }
+}
+
 impl Tenant {
    /// Yet another helper for timeline initialization.
    ///
@@ -1033,7 +1047,6 @@ impl Tenant {
                remote_metadata,
                TimelineResources {
                    remote_client,
-                    deletion_queue_client: self.deletion_queue_client.clone(),
                    timeline_get_throttle: self.timeline_get_throttle.clone(),
                },
                ctx,
@@ -1059,7 +1072,6 @@ impl Tenant {
                timeline_id,
                &index_part.metadata,
                remote_timeline_client,
-                self.deletion_queue_client.clone(),
            )
            .instrument(tracing::info_span!("timeline_delete", %timeline_id))
            .await
@@ -2417,6 +2429,13 @@ impl Tenant {
        }
    }

+    pub fn get_lsn_lease_length(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .lsn_lease_length
+            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
+    }
+
    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        // Use read-copy-update in order to avoid overwriting the location config
        // state if this races with [`Tenant::set_new_location_config`]. Note that
@@ -2921,17 +2940,9 @@ impl Tenant {
                .checked_sub(horizon)
                .unwrap_or(Lsn(0));

-            let res = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await;
-
-            match res {
-                Ok(cutoffs) => {
-                    let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
-                    assert!(old.is_none());
-                }
-                Err(e) => {
-                    tracing::warn!(timeline_id = %timeline.timeline_id, "ignoring failure to find gc cutoffs: {e:#}");
-                }
-            }
+            let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
+            let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
+            assert!(old.is_none());
        }

        if !self.is_active() || self.cancel.is_cancelled() {
@@ -3007,12 +3018,13 @@ impl Tenant {
            {
                let mut target = timeline.gc_info.write().unwrap();

+                let now = SystemTime::now();
+                target.leases.retain(|_, lease| !lease.is_expired(&now));
+
                match gc_cutoffs.remove(&timeline.timeline_id) {
                    Some(cutoffs) => {
-                        *target = GcInfo {
-                            retain_lsns: branchpoints,
-                            cutoffs,
-                        };
+                        target.retain_lsns = branchpoints;
+                        target.cutoffs = cutoffs;
                    }
                    None => {
                        // reasons for this being unavailable:
@@ -3395,6 +3407,12 @@ impl Tenant {
        let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
        let unfinished_timeline = raw_timeline.raw_timeline()?;

+        // Flush the new layer files to disk, before we make the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        unfinished_timeline.maybe_spawn_flush_loop();
+
        import_datadir::import_timeline_from_postgres_datadir(
            unfinished_timeline,
            &pgdata_path,
@@ -3406,12 +3424,6 @@ impl Tenant {
            format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
        })?;

-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        unfinished_timeline.maybe_spawn_flush_loop();
-
        fail::fail_point!("before-checkpoint-new-timeline", |_| {
            anyhow::bail!("failpoint before-checkpoint-new-timeline");
        });
@@ -3443,7 +3455,6 @@ impl Tenant {
        );
        TimelineResources {
            remote_client,
-            deletion_queue_client: self.deletion_queue_client.clone(),
            timeline_get_throttle: self.timeline_get_throttle.clone(),
        }
    }
@@ -3553,7 +3564,7 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<size::ModelInputs> {
+    ) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
        let logical_sizes_at_once = self
            .conf
            .concurrent_tenant_size_logical_size_queries
@@ -3568,8 +3579,8 @@ impl Tenant {
        // See more for on the issue #2748 condenced out of the initial PR review.
        let mut shared_cache = tokio::select! {
            locked = self.cached_logical_sizes.lock() => locked,
-            _ = cancel.cancelled() => anyhow::bail!("cancelled"),
-            _ = self.cancel.cancelled() => anyhow::bail!("tenant is shutting down"),
+            _ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
+            _ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
        };

        size::gather_inputs(
@@ -3593,10 +3604,10 @@ impl Tenant {
        cause: LogicalSizeCalculationCause,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<u64> {
+    ) -> Result<u64, size::CalculateSyntheticSizeError> {
        let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;

-        let size = inputs.calculate()?;
+        let size = inputs.calculate();

        self.set_cached_synthetic_size(size);

@@ -3831,6 +3842,8 @@ pub(crate) mod harness {
                    tenant_conf.image_layer_creation_check_threshold,
                ),
                switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
+                lsn_lease_length: Some(tenant_conf.lsn_lease_length),
+                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
            }
        }
    }
@@ -4041,13 +4054,16 @@ mod tests {
    use crate::repository::{Key, Value};
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;
+    use crate::walrecord::NeonWalRecord;
    use crate::DEFAULT_PG_VERSION;
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
+    use itertools::Itertools;
    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use rand::{thread_rng, Rng};
+    use storage_layer::PersistentLayerKey;
    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};
    use utils::bin_ser::BeSer;
@@ -5262,6 +5278,9 @@ mod tests {
        let cancel = CancellationToken::new();

        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key_end = test_key;
+        test_key_end.field6 = NUM_KEYS as u32;
+        tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));

        let mut keyspace = KeySpaceAccum::new();

@@ -6221,8 +6240,8 @@ mod tests {

        let cancel = CancellationToken::new();

-        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-        base_key.field1 = AUX_KEY_PREFIX;
+        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
        let mut test_key = base_key;
        let mut lsn = Lsn(0x10);

@@ -6327,6 +6346,7 @@ mod tests {
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
            )
            .await?;
+        tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));

        let child = tenant
            .branch_timeline_test_with_layers(
@@ -6584,8 +6604,8 @@ mod tests {
    }

    #[tokio::test]
-    async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
+    async fn test_metadata_tombstone_image_creation() {
+        let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap();
        let (tenant, ctx) = harness.load().await;

        let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
@@ -6613,7 +6633,8 @@ mod tests {
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                Lsn(0x30),
            )
-            .await?;
+            .await
+            .unwrap();

        let cancel = CancellationToken::new();

@@ -6628,23 +6649,24 @@ mod tests {
                },
                &ctx,
            )
-            .await?;
+            .await
+            .unwrap();

        // Image layers are created at last_record_lsn
        let images = tline
            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
+            .await
+            .unwrap()
            .into_iter()
            .filter(|(k, _)| k.is_metadata_key())
            .collect::<Vec<_>>();
        assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed.
-
-        Ok(())
    }

    #[tokio::test]
-    async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?;
+    async fn test_metadata_tombstone_empty_image_creation() {
+        let harness =
+            TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap();
        let (tenant, ctx) = harness.load().await;

        let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
@@ -6666,7 +6688,8 @@ mod tests {
                vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
                Lsn(0x30),
            )
-            .await?;
+            .await
+            .unwrap();

        let cancel = CancellationToken::new();

@@ -6681,16 +6704,338 @@ mod tests {
                },
                &ctx,
            )
-            .await?;
+            .await
+            .unwrap();

        // Image layers are created at last_record_lsn
        let images = tline
            .inspect_image_layers(Lsn(0x30), &ctx)
-            .await?
+            .await
+            .unwrap()
            .into_iter()
            .filter(|(k, _)| k.is_metadata_key())
            .collect::<Vec<_>>();
        assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created
+    }
+
+    #[tokio::test]
+    async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_simple_bottom_most_compaction_images")?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // Use an aux key here because aux keys are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon.
+        //
+        //  | D1 |                       | D3 |
+        // -|    |-- gc horizon -----------------
+        //  |    |                | D2 |
+        // --------- img layer ------------------
+        //
+        // What we should expect from this compaction is:
+        //  | Part of D1 |               | D3 |
+        // --------- img layer with D1+D2 at GC horizon------------------
+
+        // Bottom-most image layer: keys 0..10, placed at LSN 0x10 below.
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), test_img(&format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose.
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::Image(test_img("value 1@0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::Image(test_img("value 2@0x30")),
+            ),
+            (
+                get_key(3),
+                Lsn(0x40),
+                Value::Image(test_img("value 3@0x40")),
+            ),
+        ];
+        let delta2 = vec![
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::Image(test_img("value 5@0x20")),
+            ),
+            (
+                get_key(6),
+                Lsn(0x20),
+                Value::Image(test_img("value 6@0x20")),
+            ),
+        ];
+        let delta3 = vec![
+            (
+                get_key(8),
+                Lsn(0x40),
+                Value::Image(test_img("value 8@0x40")),
+            ),
+            (
+                get_key(9),
+                Lsn(0x40),
+                Value::Image(test_img("value 9@0x40")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![delta1, delta2, delta3], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            // Update GC info: put both the PITR and the horizon cutoff at 0x30.
+            let mut guard = tline.gc_info.write().unwrap();
+            guard.cutoffs.pitr = Lsn(0x30);
+            guard.cutoffs.horizon = Lsn(0x30);
+        }
+
+        let cancel = CancellationToken::new();
+        tline.compact_with_gc(&cancel, &ctx).await.unwrap();
+
+        // Check if the image layer at the GC horizon contains exactly what we want
+        let image_at_gc_horizon = tline
+            .inspect_image_layers(Lsn(0x30), &ctx)
+            .await
+            .unwrap()
+            .into_iter()
+            .filter(|(k, _)| k.is_metadata_key())
+            .collect::<Vec<_>>();
+
+        assert_eq!(image_at_gc_horizon.len(), 10);
+        let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10];
+        for idx in 0..10 {
+            assert_eq!(
+                image_at_gc_horizon[idx],
+                (
+                    get_key(idx as u32),
+                    test_img(&format!("value {idx}@{:#x}", expected_lsn[idx]))
+                )
+            );
+        }
+
+        // Check if old layers are removed / new layers have the expected LSN
+        let mut all_layers = tline.inspect_historic_layers().await.unwrap();
+        all_layers.sort_by(|k1, k2| {
+            (
+                k1.is_delta,
+                k1.key_range.start,
+                k1.key_range.end,
+                k1.lsn_range.start,
+                k1.lsn_range.end,
+            )
+                .cmp(&(
+                    k2.is_delta,
+                    k2.key_range.start,
+                    k2.key_range.end,
+                    k2.lsn_range.start,
+                    k2.lsn_range.end,
+                ))
+        });
+        assert_eq!(
+            all_layers,
+            vec![
+                // Image layer at GC horizon
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(10),
+                    lsn_range: Lsn(0x30)..Lsn(0x31),
+                    is_delta: false
+                },
+                // The delta layer that is cut in the middle
+                PersistentLayerKey {
+                    key_range: Key::MIN..get_key(9),
+                    lsn_range: Lsn(0x30)..Lsn(0x41),
+                    is_delta: true
+                },
+                // The delta layer we created and should not be picked for the compaction
+                PersistentLayerKey {
+                    key_range: get_key(8)..get_key(10),
+                    lsn_range: Lsn(0x40)..Lsn(0x41),
+                    is_delta: true
+                }
+            ]
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_neon_test_record() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_neon_test_record")?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // Use an aux key here because aux keys are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
+            ),
+            (get_key(2), Lsn(0x10), Value::Image("0x10".into())),
+            (
+                get_key(2),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
+            ),
+            (
+                get_key(2),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
+            ),
+            (get_key(3), Lsn(0x10), Value::Image("0x10".into())),
+            (
+                get_key(3),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_clear()),
+            ),
+            (get_key(4), Lsn(0x10), Value::Image("0x10".into())),
+            (
+                get_key(4),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_init()),
+            ),
+        ];
+        let image1 = vec![(get_key(1), "0x10".into())]; // key 1's base image goes into the image layer
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![delta1],              // delta layers
+                vec![(Lsn(0x10), image1)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+
+        assert_eq!(
+            tline.get(get_key(1), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"0x10,0x20,0x30")
+        );
+        assert_eq!(
+            tline.get(get_key(2), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"0x10,0x20,0x30")
+        );
+        // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
+        // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
+
+        Ok(())
+    }
+
+    /// LSN leases: creation, renewal, layers retained by GC, and rejection below the GC cutoff.
+    #[tokio::test]
+    async fn test_lsn_lease() -> anyhow::Result<()> {
+        let (tenant, ctx) = TenantHarness::create("test_lsn_lease")?.load().await;
+        let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+
+        let end_lsn = Lsn(0x100);
+        let image_layers = (0x20..=0x90)
+            .step_by(0x10)
+            .map(|n| {
+                (
+                    Lsn(n),
+                    vec![(key, test_img(&format!("data key at {:x}", n)))],
+                )
+            })
+            .collect();
+
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                Vec::new(),
+                image_layers,
+                end_lsn,
+            )
+            .await?;
+
+        let leased_lsns = [0x30, 0x50, 0x70];
+        let mut leases = Vec::new();
+        // Every initial lease must be granted; propagate a failure instead of discarding it.
+        for n in &leased_lsns {
+            leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
+        }
+
+        // Renewing with shorter lease should not change the lease.
+        let updated_lease_0 =
+            timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
+        assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
+
+        // Renewing with a long lease should renew lease with later expiration time.
+        let updated_lease_1 = timeline.make_lsn_lease(
+            Lsn(leased_lsns[1]),
+            timeline.get_lsn_lease_length() * 2,
+            &ctx,
+        )?;
+        assert!(updated_lease_1.valid_until > leases[1].valid_until);
+
+        // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
+        info!(
+            "latest_gc_cutoff_lsn: {}",
+            *timeline.get_latest_gc_cutoff_lsn()
+        );
+        timeline.force_set_disk_consistent_lsn(end_lsn);
+
+        let res = tenant
+            .gc_iteration(
+                Some(TIMELINE_ID),
+                0,
+                Duration::ZERO,
+                &CancellationToken::new(),
+                &ctx,
+            )
+            .await?;
+
+        // Keeping everything <= Lsn(0x70) b/c of the leases (the highest leased LSN is 0x70):
+        // 0/10: initdb layer
+        // (0/20..=0/70).step_by(0x10): image layers added when creating the timeline.
+        assert_eq!(res.layers_needed_by_leases, 7);
+        // Keeping 0/90 b/c it is the latest layer.
+        assert_eq!(res.layers_not_updated, 1);
+        // Removed 0/80.
+        assert_eq!(res.layers_removed, 1);
+
+        // Make lease on an already GC-ed LSN.
+        // 0/80 does not have a valid lease + is below latest_gc_cutoff
+        assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
+        let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
+        assert!(res.is_err());
+
+        // Should still be able to renew a currently valid lease
+        // Assumption: the original lease for 0/50 is still valid.
+        let _ =
+            timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;

        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -13,6 +13,7 @@ use pageserver_api::models::AuxFilePolicy;
 use pageserver_api::models::CompactionAlgorithm;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
+use pageserver_api::models::LsnLease;
 use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
@@ -377,6 +378,16 @@ pub struct TenantConf {
    /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
    /// file is written.
    pub switch_aux_file_policy: AuxFilePolicy,
+
+    /// The length for an explicit LSN lease request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length: Duration,
+
+    /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request.
+    /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
+    #[serde(with = "humantime_serde")]
+    pub lsn_lease_length_for_ts: Duration,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -476,6 +487,16 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub switch_aux_file_policy: Option<AuxFilePolicy>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub lsn_lease_length: Option<Duration>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    #[serde(default)]
+    pub lsn_lease_length_for_ts: Option<Duration>,
 }

 impl TenantConfOpt {
@@ -538,6 +559,12 @@ impl TenantConfOpt {
            switch_aux_file_policy: self
                .switch_aux_file_policy
                .unwrap_or(global_conf.switch_aux_file_policy),
+            lsn_lease_length: self
+                .lsn_lease_length
+                .unwrap_or(global_conf.lsn_lease_length),
+            lsn_lease_length_for_ts: self
+                .lsn_lease_length_for_ts
+                .unwrap_or(global_conf.lsn_lease_length_for_ts),
        }
    }
 }
@@ -582,6 +609,8 @@ impl Default for TenantConf {
            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
            image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
            switch_aux_file_policy: AuxFilePolicy::default_tenant_config(),
+            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
+            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
        }
    }
 }
@@ -657,6 +686,8 @@ impl From<TenantConfOpt> for models::TenantConfig {
            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
            switch_aux_file_policy: value.switch_aux_file_policy,
+            lsn_lease_length: value.lsn_lease_length.map(humantime),
+            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
        }
    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -22,7 +22,7 @@ use async_stream::try_stream;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use either::Either;
-use futures::Stream;
+use futures::{Stream, StreamExt};
 use hex;
 use std::{
    cmp::Ordering,
@@ -259,6 +259,16 @@ where
        Ok(result)
    }

+    /// Wraps [`Self::get_stream_from`] in a boxed [`DiskBtreeIterator`] driven via `next()`.
+    pub fn iter<'a>(
+        &'a self,
+        start_key: &'a [u8; L],
+        ctx: &'a RequestContext,
+    ) -> DiskBtreeIterator<'a> {
+        let stream = Box::pin(self.get_stream_from(start_key, ctx));
+        DiskBtreeIterator { stream }
+    }
+
    /// Return a stream which yields all key, value pairs from the index
    /// starting from the first key greater or equal to `start_key`.
    ///
@@ -496,6 +506,19 @@ where
    }
 }

+pub struct DiskBtreeIterator<'a> {
+    #[allow(clippy::type_complexity)] // the field's type below nests several generic layers
+    stream: std::pin::Pin<
+        Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
+    >,
+}
+
+impl<'a> DiskBtreeIterator<'a> {
+    pub async fn next(&mut self) -> Option<std::result::Result<(Vec<u8>, u64), DiskBtreeError>> {
+        self.stream.next().await // forwards to the boxed stream; its `Option` is returned as-is
+    }
+}
+
 ///
 /// Public builder object, for creating a new tree.
 ///
@@ -1088,6 +1111,17 @@ pub(crate) mod tests {
                == all_data.get(&u128::MAX).cloned()
        );

+        // Test iterator and get_stream API
+        let mut iter = reader.iter(&[0; 16], &ctx);
+        let mut cnt = 0;
+        while let Some(res) = iter.next().await {
+            let (key, val) = res?;
+            let key = u128::from_be_bytes(key.as_slice().try_into().unwrap());
+            assert_eq!(val, *all_data.get(&key).unwrap());
+            cnt += 1;
+        }
+        assert_eq!(cnt, all_data.len());
+
        Ok(())
    }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -1,15 +1,23 @@
-//! Every image of a certain timeline from [`crate::tenant::Tenant`]
-//! has a metadata that needs to be stored persistently.
+//! Describes the legacy, now hopefully no longer modified, per-timeline metadata stored in
+//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines,
+//! this struct and its original serialization format are still needed because they were written a
+//! long time ago.
 //!
-//! Later, the file gets used in [`remote_timeline_client`] as a part of
-//! external storage import and export operations.
+//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json
+//! versioning.
 //!
-//! The module contains all structs and related helper methods related to timeline metadata.
+//! To clean up this module we need to migrate all index_part.json files to a later version.
+//! While doing this, we need to be mindful about S3-based recovery as well, so it might take
+//! however long we keep the old versions to be able to delete the old code. After that, we can
+//! remove everything other than [`TimelineMetadataBodyV2`], rename it to `TimelineMetadata` and
+//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards
+//! compatibility.
 //!
 //! [`remote_timeline_client`]: super::remote_timeline_client
+//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart

 use anyhow::ensure;
-use serde::{de::Error, Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 use utils::bin_ser::SerializeError;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

@@ -17,17 +25,37 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
 const METADATA_FORMAT_VERSION: u16 = 4;

 /// Previous supported format versions.
+///
+/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming
+/// that requires a scrubber run which is yet to be done.
 const METADATA_OLD_FORMAT_VERSION: u16 = 3;

-/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
+/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic.
 ///
 /// This is the same assumption that PostgreSQL makes with the control file,
+///
 /// see PG_CONTROL_MAX_SAFE_SIZE
 const METADATA_MAX_SIZE: usize = 512;

-/// Metadata stored on disk for each timeline
+/// Legacy metadata stored as a component of `index_part.json` per timeline.
 ///
-/// The fields correspond to the values we hold in memory, in Timeline.
+/// Do not make new changes to this type or the module. In production, we have two different kinds
+/// of serializations of this type: bincode and json. Bincode version reflects what used to be
+/// stored on disk in earlier versions and does internal crc32 checksumming.
+///
+/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would
+/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern
+/// as-exists in `index_part.json` ([`self::modern_serde`]).
+///
+/// ```compile_fail
+/// #[derive(serde::Serialize)]
+/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
+///
+/// ```compile_fail
+/// #[derive(serde::Deserialize)]
+/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata);
+/// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct TimelineMetadata {
    hdr: TimelineMetadataHeader,
@@ -40,6 +68,49 @@ struct TimelineMetadataHeader {
    size: u16,           // size of serialized metadata
    format_version: u16, // metadata format version (used for compatibility checks)
 }
+
+impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader {
+    type Error = Crc32CalculationFailed;
+
+    /// Builds the header for `value` from the crc32c and length of its `BeSer` encoding.
+    fn try_from(value: &TimelineMetadataBodyV2) -> Result<Self, Self::Error> {
+        /// `Write` sink that keeps only a running crc32c and the number of bytes seen.
+        #[derive(Default)]
+        struct ChecksumWriter {
+            checksum: u32,
+            written: usize,
+        }
+
+        impl std::io::Write for ChecksumWriter {
+            fn write(&mut self, chunk: &[u8]) -> std::io::Result<usize> {
+                self.checksum = crc32c::crc32c_append(self.checksum, chunk);
+                self.written += chunk.len();
+                Ok(chunk.len())
+            }
+
+            fn flush(&mut self) -> std::io::Result<()> {
+                Ok(())
+            }
+        }
+
+        // Recompute the crc32 so that `TimelineMetadata` comparisons work across
+        // serialization versions.
+        let mut writer = ChecksumWriter::default();
+        <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(value, &mut writer)
+            .map_err(Crc32CalculationFailed)?;
+
+        Ok(Self {
+            checksum: writer.checksum,
+            size: (METADATA_HDR_SIZE + writer.written) as u16,
+            format_version: METADATA_FORMAT_VERSION,
+        })
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+#[error("re-serializing for crc32 failed")]
+struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError);
+
 const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -111,6 +182,12 @@ impl TimelineMetadata {
        }
    }

+    /// Test-only: rebuilds `hdr` from the current `body` and returns the updated value.
+    #[cfg(test)]
+    pub(crate) fn with_recalculated_checksum(self) -> anyhow::Result<Self> {
+        Ok(Self { hdr: TimelineMetadataHeader::try_from(&self.body)?, ..self })
+    }
+
    fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
        let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;

@@ -261,32 +338,8 @@ impl TimelineMetadata {
    }
 }

-impl<'de> Deserialize<'de> for TimelineMetadata {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom)
-    }
-}
-
-impl Serialize for TimelineMetadata {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?;
-        bytes.serialize(serializer)
-    }
-}
-
 pub(crate) mod modern_serde {
-    use crate::tenant::metadata::METADATA_FORMAT_VERSION;
-
-    use super::{
-        TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE,
-    };
+    use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader};
    use serde::{Deserialize, Serialize};

    pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result<TimelineMetadata, D::Error>
@@ -322,71 +375,15 @@ pub(crate) mod modern_serde {

                let de = serde::de::value::MapAccessDeserializer::new(map);
                let body = TimelineMetadataBodyV2::deserialize(de)?;
+                let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?;

-                // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works
-                // across serialization versions
-                let mut sink = Crc32Sink::default();
-                <TimelineMetadataBodyV2 as utils::bin_ser::BeSer>::ser_into(&body, &mut sink)
-                    .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?;
-
-                let size = METADATA_HDR_SIZE + sink.count;
-
-                Ok(TimelineMetadata {
-                    hdr: TimelineMetadataHeader {
-                        checksum: sink.crc,
-                        size: size as u16,
-                        format_version: METADATA_FORMAT_VERSION,
-                    },
-                    body,
-                })
+                Ok(TimelineMetadata { hdr, body })
            }
        }

        deserializer.deserialize_any(Visitor)
    }

-    #[derive(Default)]
-    struct Crc32Sink {
-        crc: u32,
-        count: usize,
-    }
-
-    impl std::io::Write for Crc32Sink {
-        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
-            self.crc = crc32c::crc32c_append(self.crc, buf);
-            self.count += buf.len();
-            Ok(buf.len())
-        }
-
-        fn flush(&mut self) -> std::io::Result<()> {
-            Ok(())
-        }
-    }
-
-    #[derive(thiserror::Error)]
-    #[error("re-serializing for crc32 failed")]
-    struct Crc32CalculationFailed<E>(#[source] E);
-
-    // this should be true for one release, after that we can change it to false
-    // remember to check the IndexPart::metadata field TODO comment as well
-    const LEGACY_BINCODED_BYTES: bool = true;
-
-    #[derive(serde::Serialize)]
-    #[serde(transparent)]
-    struct LegacyPaddedBytes<'a>(&'a TimelineMetadata);
-
-    struct JustTheBodyV2<'a>(&'a TimelineMetadata);
-
-    impl serde::Serialize for JustTheBodyV2<'_> {
-        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-        where
-            S: serde::Serializer,
-        {
-            // header is not needed, upon reading we've upgraded all v1 to v2
-            self.0.body.serialize(serializer)
-        }
-    }
-
    pub(crate) fn serialize<S>(
        metadata: &TimelineMetadata,
        serializer: S,
@@ -394,25 +391,23 @@ pub(crate) mod modern_serde {
    where
        S: serde::Serializer,
    {
-        // we cannot use TimelineMetadata::serialize for now because it'll do
-        // TimelineMetadata::to_bytes
-        if LEGACY_BINCODED_BYTES {
-            LegacyPaddedBytes(metadata).serialize(serializer)
-        } else {
-            JustTheBodyV2(metadata).serialize(serializer)
-        }
+        // header is not needed, upon reading we've upgraded all v1 to v2
+        metadata.body.serialize(serializer)
    }

    #[test]
    fn deserializes_bytes_as_well_as_equivalent_body_v2() {
        #[derive(serde::Deserialize, serde::Serialize)]
-        struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata);
+        struct Wrapper(
+            #[serde(deserialize_with = "deserialize", serialize_with = "serialize")]
+            TimelineMetadata,
+        );

        let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]";

        let wrapper_from_bytes = serde_json::from_str::<Wrapper>(too_many_bytes).unwrap();

-        let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap();
+        let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap();

        assert_eq!(
            serialized,
@@ -553,59 +548,6 @@ mod tests {
        );
    }

-    #[test]
-    fn test_metadata_bincode_serde() {
-        let original_metadata = TimelineMetadata::new(
-            Lsn(0x200),
-            Some(Lsn(0x100)),
-            Some(TIMELINE_ID),
-            Lsn(0),
-            Lsn(0),
-            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
-        );
-        let metadata_bytes = original_metadata
-            .to_bytes()
-            .expect("Cannot create bytes array from metadata");
-
-        let metadata_bincode_be_bytes = original_metadata
-            .ser()
-            .expect("Cannot serialize the metadata");
-
-        // 8 bytes for the length of the vector
-        assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
-
-        let expected_bincode_bytes = {
-            let mut temp = vec![];
-            let len_bytes = metadata_bytes.len().to_be_bytes();
-            temp.extend_from_slice(&len_bytes);
-            temp.extend_from_slice(&metadata_bytes);
-            temp
-        };
-        assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
-
-        let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
-        // Deserialized metadata has the metadata header, which is different from the serialized one.
-        //   Reference: TimelineMetaData::to_bytes()
-        let expected_metadata = {
-            let mut temp_metadata = original_metadata;
-            let body_bytes = temp_metadata
-                .body
-                .ser()
-                .expect("Cannot serialize the metadata body");
-            let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
-            let hdr = TimelineMetadataHeader {
-                size: metadata_size as u16,
-                format_version: METADATA_FORMAT_VERSION,
-                checksum: crc32c::crc32c(&body_bytes),
-            };
-            temp_metadata.hdr = hdr;
-            temp_metadata
-        };
-        assert_eq!(deserialized_metadata, expected_metadata);
-    }
-
    #[test]
    fn test_metadata_bincode_serde_ensure_roundtrip() {
        let original_metadata = TimelineMetadata::new(
@@ -619,8 +561,6 @@ mod tests {
            crate::DEFAULT_PG_VERSION,
        );
        let expected_bytes = vec![
-            /* bincode length encoding bytes */
-            0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
            /* TimelineMetadataHeader */
            4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
            /* TimelineMetadataBodyV2 */
@@ -650,7 +590,7 @@ mod tests {
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0,
        ];
-        let metadata_ser_bytes = original_metadata.ser().unwrap();
+        let metadata_ser_bytes = original_metadata.to_bytes().unwrap();
        assert_eq!(metadata_ser_bytes, expected_bytes);

        let expected_metadata = {
@@ -668,7 +608,7 @@ mod tests {
            temp_metadata.hdr = hdr;
            temp_metadata
        };
-        let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
+        let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap();
        assert_eq!(des_metadata, expected_metadata);
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -38,14 +38,17 @@ pub struct IndexPart {
    /// that latest version stores.
    pub layer_metadata: HashMap<LayerName, LayerFileMetadata>,

-    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
-    // It's duplicated for convenience when reading the serialized structure, but is
-    // private because internally we would read from metadata instead.
+    /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the
+    /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be
+    /// reused.
    pub(super) disk_consistent_lsn: Lsn,

-    // TODO: later make this "rename" to "alias", rename field as "legacy_metadata"
+    // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version. Adding
+    // the "alias = metadata" was forgotten in #7693, so we have to use "rename = metadata_bytes"
+    // for backwards compatibility.
    #[serde(
        rename = "metadata_bytes",
+        alias = "metadata",
        with = "crate::tenant::metadata::modern_serde"
    )]
    pub metadata: TimelineMetadata,
@@ -76,10 +79,11 @@ impl IndexPart {
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
-    const LATEST_VERSION: usize = 6;
+    /// - 7: metadata_bytes is no longer written, but still read
+    const LATEST_VERSION: usize = 7;

    // Versions we may see when reading from a bucket.
-    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6];
+    pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7];

    pub const FILE_NAME: &'static str = "index_part.json";

@@ -95,7 +99,7 @@ impl IndexPart {
        }
    }

-    pub fn get_version(&self) -> usize {
+    pub fn version(&self) -> usize {
        self.version
    }

@@ -217,9 +221,9 @@ impl Lineage {

 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
-
    use super::*;
+    use std::str::FromStr;
+    use utils::id::TimelineId;

    #[test]
    fn v1_indexpart_is_parsed() {
@@ -338,8 +342,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            lineage: Lineage::default(),
            last_aux_file_policy: None,
        };
@@ -515,8 +518,7 @@ mod tests {
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            lineage: Lineage {
                reparenting_history_truncated: false,
                reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()],
@@ -529,6 +531,60 @@ mod tests {
        assert_eq!(part, expected);
    }

+    #[test]
+    fn v7_indexpart_is_parsed() {
+        let example = r#"{
+            "version": 7,
+            "layer_metadata":{
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
+                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
+            },
+            "disk_consistent_lsn":"0/16960E8",
+            "metadata": {
+                "disk_consistent_lsn": "0/16960E8",
+                "prev_record_lsn": "0/1696070",
+                "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e",
+                "ancestor_lsn": "0/0",
+                "latest_gc_cutoff_lsn": "0/1696070",
+                "initdb_lsn": "0/1696070",
+                "pg_version": 14
+            },
+            "deleted_at": "2023-07-31T09:00:00.123"
+        }"#;
+
+        let expected = IndexPart {
+            version: 7,
+            layer_metadata: HashMap::from([
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata {
+                    file_size: 25600000,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                }),
+                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata {
+                    file_size: 9007199254741001,
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
+                })
+            ]),
+            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
+            metadata: TimelineMetadata::new(
+                Lsn::from_str("0/16960E8").unwrap(),
+                Some(Lsn::from_str("0/1696070").unwrap()),
+                Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()),
+                Lsn::INVALID,
+                Lsn::from_str("0/1696070").unwrap(),
+                Lsn::from_str("0/1696070").unwrap(),
+                14,
+            ).with_recalculated_checksum().unwrap(),
+            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
+            lineage: Default::default(),
+            last_aux_file_policy: Default::default(),
+        };
+
+        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
+        assert_eq!(part, expected);
+    }
+
    fn parse_naive_datetime(s: &str) -> NaiveDateTime {
        chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap()
    }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -513,7 +513,7 @@ impl<'a> TenantDownloader<'a> {
        // cover our access to local storage.
        let Ok(_guard) = self.secondary_state.gate.enter() else {
            // Shutting down
-            return Ok(());
+            return Err(UpdateError::Cancelled);
        };

        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
@@ -846,7 +846,7 @@ impl<'a> TenantDownloader<'a> {
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!("Cancelled -- dropping out of layer loop");
-                return Ok(());
+                return Err(UpdateError::Cancelled);
            }

            // Existing on-disk layers: just update their access time.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -3,7 +3,6 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;

-use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
@@ -11,7 +10,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;

-use super::{LogicalSizeCalculationCause, Tenant};
+use super::{GcError, LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -43,6 +42,40 @@ pub struct SegmentMeta {
    pub kind: LsnKind,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum CalculateSyntheticSizeError {
+    /// Something went wrong internally to the calculation of logical size at a particular branch point
+    #[error("Failed to calculated logical size on timeline {timeline_id} at {lsn}: {error}")]
+    LogicalSize {
+        timeline_id: TimelineId,
+        lsn: Lsn,
+        error: CalculateLogicalSizeError,
+    },
+
+    /// Something went wrong internally when calculating GC parameters at start of size calculation
+    #[error(transparent)]
+    GcInfo(GcError),
+
+    /// Totally unexpected errors, like panics joining a task
+    #[error(transparent)]
+    Fatal(anyhow::Error),
+
+    /// Tenant shut down while calculating size
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+impl From<GcError> for CalculateSyntheticSizeError {
+    fn from(value: GcError) -> Self {
+        match value {
+            GcError::TenantCancelled | GcError::TimelineCancelled => {
+                CalculateSyntheticSizeError::Cancelled
+            }
+            other => CalculateSyntheticSizeError::GcInfo(other),
+        }
+    }
+}
+
 impl SegmentMeta {
    fn size_needed(&self) -> bool {
        match self.kind {
@@ -116,12 +149,9 @@ pub(super) async fn gather_inputs(
    cause: LogicalSizeCalculationCause,
    cancel: &CancellationToken,
    ctx: &RequestContext,
-) -> anyhow::Result<ModelInputs> {
+) -> Result<ModelInputs, CalculateSyntheticSizeError> {
    // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
-    tenant
-        .refresh_gc_info(cancel, ctx)
-        .await
-        .context("Failed to refresh gc_info before gathering inputs")?;
+    tenant.refresh_gc_info(cancel, ctx).await?;

    // Collect information about all the timelines
    let mut timelines = tenant.list_timelines();
@@ -327,6 +357,12 @@ pub(super) async fn gather_inputs(
    )
    .await?;

+    if tenant.cancel.is_cancelled() {
+        // If we're shutting down, return an error rather than a sparse result that might include some
+        // timelines from before we started shutting down
+        return Err(CalculateSyntheticSizeError::Cancelled);
+    }
+
    Ok(ModelInputs {
        segments,
        timeline_inputs,
@@ -335,9 +371,8 @@ pub(super) async fn gather_inputs(

 /// Augment 'segments' with logical sizes
 ///
-/// this will probably conflict with on-demand downloaded layers, or at least force them all
-/// to be downloaded
-///
+/// This will leave segments' sizes as None if the Timeline associated with the segment is deleted concurrently
+/// (i.e. we cannot read its logical size at a particular LSN).
 async fn fill_logical_sizes(
    timelines: &[Arc<Timeline>],
    segments: &mut [SegmentMeta],
@@ -345,7 +380,7 @@ async fn fill_logical_sizes(
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
    cause: LogicalSizeCalculationCause,
    ctx: &RequestContext,
-) -> anyhow::Result<()> {
+) -> Result<(), CalculateSyntheticSizeError> {
    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
        timelines
            .iter()
@@ -387,7 +422,7 @@ async fn fill_logical_sizes(
    }

    // Perform the size lookups
-    let mut have_any_error = false;
+    let mut have_any_error = None;
    while let Some(res) = joinset.join_next().await {
        // each of these come with Result<anyhow::Result<_>, JoinError>
        // because of spawn + spawn_blocking
@@ -398,21 +433,36 @@ async fn fill_logical_sizes(
            Err(join_error) => {
                // cannot really do anything, as this panic is likely a bug
                error!("task that calls spawn_ondemand_logical_size_calculation panicked: {join_error:#}");
-                have_any_error = true;
+
+                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
+                    anyhow::anyhow!(join_error)
+                        .context("task that calls spawn_ondemand_logical_size_calculation"),
+                ));
            }
            Ok(Err(recv_result_error)) => {
                // cannot really do anything, as this panic is likely a bug
                error!("failed to receive logical size query result: {recv_result_error:#}");
-                have_any_error = true;
+                have_any_error = Some(CalculateSyntheticSizeError::Fatal(
+                    anyhow::anyhow!(recv_result_error)
+                        .context("Receiving logical size query result"),
+                ));
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
+                if matches!(error, CalculateLogicalSizeError::Cancelled) {
+                    // Skip this: it's okay if one timeline among many is shutting down while we
+                    // calculate inputs for the overall tenant.
+                    continue;
+                } else {
                    warn!(
                        timeline_id=%timeline.timeline_id,
                        "failed to calculate logical size at {lsn}: {error:#}"
                    );
+                    have_any_error = Some(CalculateSyntheticSizeError::LogicalSize {
+                        timeline_id: timeline.timeline_id,
+                        lsn,
+                        error,
+                    });
                }
-                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
                debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
@@ -426,10 +476,10 @@ async fn fill_logical_sizes(
    // prune any keys not needed anymore; we record every used key and added key.
    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));

-    if have_any_error {
+    if let Some(error) = have_any_error {
        // we cannot complete this round, because we are missing data.
        // we have however cached all we were able to request calculation on.
-        anyhow::bail!("failed to calculate some logical_sizes");
+        return Err(error);
    }

    // Insert the looked up sizes to the Segments
@@ -443,33 +493,28 @@ async fn fill_logical_sizes(

        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
            seg.segment.size = Some(*size);
-        } else {
-            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
        }
    }
    Ok(())
 }

 impl ModelInputs {
-    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
+    pub fn calculate_model(&self) -> tenant_size_model::StorageModel {
        // Convert SegmentMetas into plain Segments
-        let storage = StorageModel {
+        StorageModel {
            segments: self
                .segments
                .iter()
                .map(|seg| seg.segment.clone())
                .collect(),
-        };
-
-        Ok(storage)
+        }
    }

    // calculate total project size
-    pub fn calculate(&self) -> anyhow::Result<u64> {
-        let storage = self.calculate_model()?;
+    pub fn calculate(&self) -> u64 {
+        let storage = self.calculate_model();
        let sizes = storage.calculate();
-
-        Ok(sizes.total_size)
+        sizes.total_size
    }
 }

@@ -656,7 +701,7 @@ fn verify_size_for_multiple_branches() {
 "#;
    let inputs: ModelInputs = serde_json::from_str(doc).unwrap();

-    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
+    assert_eq!(inputs.calculate(), 37_851_408);
 }

 #[test]
@@ -711,7 +756,7 @@ fn verify_size_for_one_branch() {

    let model: ModelInputs = serde_json::from_str(doc).unwrap();

-    let res = model.calculate_model().unwrap().calculate();
+    let res = model.calculate_model().calculate();

    println!("calculated synthetic size: {}", res.total_size);
    println!("result: {:?}", serde_json::to_string(&res.segments));
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -318,7 +318,7 @@ pub(crate) struct LayerFringe {
 #[derive(Debug)]
 struct LayerKeyspace {
    layer: ReadableLayer,
-    target_keyspace: Vec<KeySpace>,
+    target_keyspace: KeySpaceRandomAccum,
 }

 impl LayerFringe {
@@ -342,17 +342,13 @@ impl LayerFringe {
                _,
                LayerKeyspace {
                    layer,
-                    target_keyspace,
+                    mut target_keyspace,
                },
-            )) => {
-                let mut keyspace = KeySpaceRandomAccum::new();
-                for ks in target_keyspace {
-                    for part in ks.ranges {
-                        keyspace.add_range(part);
-                    }
-                }
-                Some((layer, keyspace.consume_keyspace(), read_desc.lsn_range))
-            }
+            )) => Some((
+                layer,
+                target_keyspace.consume_keyspace(),
+                read_desc.lsn_range,
+            )),
            None => unreachable!("fringe internals are always consistent"),
        }
    }
@@ -367,16 +363,18 @@ impl LayerFringe {
        let entry = self.layers.entry(layer_id.clone());
        match entry {
            Entry::Occupied(mut entry) => {
-                entry.get_mut().target_keyspace.push(keyspace);
+                entry.get_mut().target_keyspace.add_keyspace(keyspace);
            }
            Entry::Vacant(entry) => {
                self.planned_reads_by_lsn.push(ReadDesc {
                    lsn_range,
                    layer_id: layer_id.clone(),
                });
+                let mut accum = KeySpaceRandomAccum::new();
+                accum.add_keyspace(keyspace);
                entry.insert(LayerKeyspace {
                    layer,
-                    target_keyspace: vec![keyspace],
+                    target_keyspace: accum,
                });
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -219,7 +219,6 @@ pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
-    lsn_range: Range<Lsn>,

    file: VirtualFile,
    file_id: FileId,
@@ -785,7 +784,6 @@ impl DeltaLayerInner {
            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-            lsn_range: actual_summary.lsn_range,
            max_vectored_read_bytes,
        }))
    }
@@ -911,7 +909,7 @@ impl DeltaLayerInner {

        let reads = Self::plan_reads(
            &keyspace,
-            lsn_range,
+            lsn_range.clone(),
            data_end_offset,
            index_reader,
            planner,
@@ -924,11 +922,50 @@ impl DeltaLayerInner {
        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
            .await;

-        reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
+        reconstruct_state.on_lsn_advanced(&keyspace, lsn_range.start);

        Ok(())
    }

+    /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+        let mut result = Vec::new();
+        let mut stream =
+            Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        let mut buf = Vec::new();
+        while let Some(item) = stream.next().await {
+            let (key, lsn, pos) = item?;
+            // TODO: dedup code with get_reconstruct_value
+            // TODO: ctx handling and sharding
+            cursor
+                .read_blob_into_buf(pos.pos(), &mut buf, ctx)
+                .await
+                .with_context(|| {
+                    format!("Failed to read blob from virtual file {}", self.file.path)
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    self.file.path
+                )
+            })?;
+            result.push((key, lsn, val));
+        }
+        Ok(result)
+    }
+
    async fn plan_reads<Reader>(
        keyspace: &KeySpace,
        lsn_range: Range<Lsn>,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -485,6 +485,34 @@ impl ImageLayerInner {
        Ok(())
    }

+    /// Load all key-values in the image layer; should be replaced by an iterator-based interface in the future.
+    #[cfg(test)]
+    pub(super) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, Value)>> {
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
+        let mut result = Vec::new();
+        let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx));
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let cursor = block_reader.block_cursor();
+        while let Some(item) = stream.next().await {
+            // TODO: dedup code with get_reconstruct_value
+            let (raw_key, offset) = item?;
+            let key = Key::from_slice(&raw_key[..KEY_SIZE]);
+            // TODO: ctx handling and sharding
+            let blob = cursor
+                .read_blob(offset, ctx)
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+            result.push((key, self.lsn, Value::Image(value)));
+        }
+        Ok(result)
+    }
+
    /// Traverse the layer's index to build read operations on the overlap of the input keyspace
    /// and the keys in this layer.
    ///
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -52,7 +52,7 @@ pub struct InMemoryLayer {

    /// Frozen layers have an exclusive end LSN.
    /// Writes are only allowed when this is `None`.
-    end_lsn: OnceLock<Lsn>,
+    pub(crate) end_lsn: OnceLock<Lsn>,

    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
    local_path_str: Arc<str>,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -388,6 +388,23 @@ impl Layer {
            })
    }

+    /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
+    #[cfg(test)]
+    pub(crate) async fn load_key_values(
+        &self,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        let layer = self
+            .0
+            .get_or_maybe_download(true, Some(ctx))
+            .await
+            .map_err(|err| match err {
+                DownloadError::DownloadCancelled => GetVectoredError::Cancelled,
+                other => GetVectoredError::Other(anyhow::anyhow!(other)),
+            })?;
+        layer.load_key_values(&self.0, ctx).await
+    }
+
    /// Download the layer if evicted.
    ///
    /// Will not error when the layer is already downloaded.
@@ -1757,6 +1774,20 @@ impl DownloadedLayer {
        }
    }

+    #[cfg(test)]
+    async fn load_key_values(
+        &self,
+        owner: &Arc<LayerInner>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<(Key, Lsn, crate::repository::Value)>> {
+        use LayerKind::*;
+
+        match self.get(owner, ctx).await? {
+            Delta(d) => d.load_key_values(ctx).await,
+            Image(i) => i.load_key_values(ctx).await,
+        }
+    }
+
    async fn dump(&self, owner: &Arc<LayerInner>, ctx: &RequestContext) -> anyhow::Result<()> {
        use LayerKind::*;
        match self.get(owner, ctx).await? {
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -346,6 +346,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
        // cutoff specified as time.
        let ctx =
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+
        let mut first = true;
        loop {
            tokio::select! {
@@ -362,6 +363,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            if first {
                first = false;
+
+                if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel)
+                    .await
+                    .is_err()
+                {
+                    break;
+                }
+
                if random_init_delay(period, &cancel).await.is_err() {
                    break;
                }
@@ -531,6 +540,21 @@ pub(crate) async fn random_init_delay(
    }
 }

+/// Delays GC by the default lease length at restart.
+///
+/// We do this because the lease mapping is not persisted to disk. By delaying GC by the default
+/// length, we guarantee that all the leases we granted before the restart will have expired
+/// when we run GC for the first time after the restart.
+pub(crate) async fn delay_by_lease_length(
+    length: Duration,
+    cancel: &CancellationToken,
+) -> Result<(), Cancelled> {
+    match tokio::time::timeout(length, cancel.cancelled()).await {
+        Ok(_) => Err(Cancelled),
+        Err(_) => Ok(()),
+    }
+}
+
 /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
 pub(crate) fn warn_when_period_overrun(
    elapsed: Duration,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,3 +1,4 @@
+pub(crate) mod analysis;
 mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
@@ -46,7 +47,6 @@ use utils::{
    vec_map::VecMap,
 };

-use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -60,7 +60,12 @@ use std::{
    cmp::{max, min, Ordering},
    ops::ControlFlow,
 };
+use std::{
+    collections::btree_map::Entry,
+    ops::{Deref, Range},
+};

+use crate::metrics::GetKind;
 use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS;
 use crate::{
    aux_file::AuxFileSizeEstimator,
@@ -74,7 +79,6 @@ use crate::{
    disk_usage_eviction_task::DiskUsageEvictionInfo,
    pgdatadir_mapping::CollectKeySpaceError,
 };
-use crate::{deletion_queue::DeletionQueueClient, metrics::GetKind};
 use crate::{
    disk_usage_eviction_task::finite_f32,
    tenant::storage_layer::{
@@ -204,7 +208,6 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
    pub remote_client: RemoteTimelineClient,
-    pub deletion_queue_client: DeletionQueueClient,
    pub timeline_get_throttle: Arc<
        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
    >,
@@ -321,6 +324,8 @@ pub struct Timeline {
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
    /// Must always be acquired before the layer map/individual layer lock
    /// to avoid deadlock.
+    ///
+    /// The state is cleared upon freezing.
    write_lock: tokio::sync::Mutex<Option<TimelineWriterState>>,

    /// Used to avoid multiple `flush_loop` tasks running
@@ -423,6 +428,14 @@ pub struct Timeline {

    /// Indicate whether aux file v2 storage is enabled.
    pub(crate) last_aux_file_policy: AtomicAuxFilePolicy,
+
+    /// Some test cases directly place keys into the timeline without actually modifying the directory
+    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
+    /// these keys won't get garbage-collected during compaction/GC. This field only modifies the dense
+    /// keyspace return value of `collect_keyspace`. For sparse keyspaces, use AUX keys for testing, and
+    /// in the future, add `extra_test_sparse_keyspace` if necessary.
+    #[cfg(test)]
+    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,
 }

 pub struct WalReceiverInfo {
@@ -444,6 +457,9 @@ pub(crate) struct GcInfo {

    /// The cutoff coordinates, which are combined by selecting the minimum.
    pub(crate) cutoffs: GcCutoffs,
+
+    /// Leases granted to particular LSNs.
+    pub(crate) leases: BTreeMap<Lsn, LsnLease>,
 }

 impl GcInfo {
@@ -1545,17 +1561,46 @@ impl Timeline {
        Ok(())
    }

-    /// Obtains a temporary lease blocking garbage collection for the given LSN
+    /// Obtains a temporary lease blocking garbage collection for the given LSN.
+    ///
+    /// This function will error if the requested LSN is less than the `latest_gc_cutoff_lsn` and there is also
+    /// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
+    /// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
+    /// the requested lease.
    pub(crate) fn make_lsn_lease(
        &self,
-        _lsn: Lsn,
+        lsn: Lsn,
+        length: Duration,
        _ctx: &RequestContext,
    ) -> anyhow::Result<LsnLease> {
-        const LEASE_LENGTH: Duration = Duration::from_secs(5 * 60);
-        let lease = LsnLease {
-            valid_until: SystemTime::now() + LEASE_LENGTH,
+        let lease = {
+            let mut gc_info = self.gc_info.write().unwrap();
+
+            let valid_until = SystemTime::now() + length;
+
+            let entry = gc_info.leases.entry(lsn);
+
+            let lease = {
+                if let Entry::Occupied(mut occupied) = entry {
+                    let existing_lease = occupied.get_mut();
+                    if valid_until > existing_lease.valid_until {
+                        existing_lease.valid_until = valid_until;
+                    }
+                    existing_lease.clone()
+                } else {
+                    // Reject already GC-ed LSN (lsn < latest_gc_cutoff)
+                    let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
+                    if lsn < *latest_gc_cutoff_lsn {
+                        bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
+                    }
+
+                    entry.or_insert(LsnLease { valid_until }).clone()
+                }
+            };
+
+            lease
        };
-        // TODO: dummy implementation
+
        Ok(lease)
    }

@@ -1568,7 +1613,15 @@ impl Timeline {
    // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
    // polluting the span hierarchy.
    pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let to_lsn = self.freeze_inmem_layer(false).await;
+        let to_lsn = {
+            // Freeze the current open in-memory layer. It will be written to disk on next
+            // iteration.
+            let mut g = self.write_lock.lock().await;
+
+            let to_lsn = self.get_last_record_lsn();
+            self.freeze_inmem_layer_at(to_lsn, &mut g).await;
+            to_lsn
+        };
        self.flush_frozen_layers_and_wait(to_lsn).await
    }

@@ -1577,7 +1630,7 @@ impl Timeline {
    // an ephemeral layer open forever when idle.  It also freezes layers if the global limit on
    // ephemeral layer bytes has been breached.
    pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
-        let Ok(_write_guard) = self.write_lock.try_lock() else {
+        let Ok(mut write_guard) = self.write_lock.try_lock() else {
            // If the write lock is held, there is an active wal receiver: rolling open layers
            // is their responsibility while they hold this lock.
            return;
@@ -1654,24 +1707,35 @@ impl Timeline {
            self.last_freeze_at.load(),
            open_layer.get_opened_at(),
        ) {
-            match open_layer.info() {
+            let at_lsn = match open_layer.info() {
                InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => {
                    // We may reach this point if the layer was already frozen by not yet flushed: flushing
                    // happens asynchronously in the background.
                    tracing::debug!(
                        "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})"
                    );
+                    None
                }
                InMemoryLayerInfo::Open { .. } => {
                    // Upgrade to a write lock and freeze the layer
                    drop(layers_guard);
                    let mut layers_guard = self.layers.write().await;
-                    layers_guard
-                        .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at)
+                    let froze = layers_guard
+                        .try_freeze_in_memory_layer(
+                            current_lsn,
+                            &self.last_freeze_at,
+                            &mut write_guard,
+                        )
                        .await;
+                    Some(current_lsn).filter(|_| froze)
+                }
+            };
+            if let Some(lsn) = at_lsn {
+                let res: Result<u64, _> = self.flush_frozen_layers(lsn);
+                if let Err(e) = res {
+                    tracing::info!("failed to flush frozen layer after background freeze: {e:#}");
                }
            }
-            self.flush_frozen_layers();
        }
    }

@@ -2035,11 +2099,11 @@ impl Timeline {
            true
        } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() {
            info!(
-                    "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
-                    projected_lsn,
-                    layer_size,
-                    opened_at.elapsed()
-                );
+                "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})",
+                projected_lsn,
+                layer_size,
+                opened_at.elapsed()
+            );

            true
        } else {
@@ -2053,6 +2117,24 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;

 // Private functions
 impl Timeline {
+    pub(crate) fn get_lsn_lease_length(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .lsn_lease_length
+            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
+    }
+
+    // TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
+    #[allow(unused)]
+    pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .lsn_lease_length_for_ts
+            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts)
+    }
+
    pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2322,6 +2404,9 @@ impl Timeline {
                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),

                last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy),
+
+                #[cfg(test)]
+                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2380,7 +2465,7 @@ impl Timeline {
                let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
                self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await;
                let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
-                assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..}));
+                assert!(matches!(*flush_loop_state, FlushLoopState::Running{..}));
                *flush_loop_state  = FlushLoopState::Exited;
                Ok(())
            }
@@ -3643,28 +3728,21 @@ impl Timeline {
        self.last_record_lsn.advance(new_lsn);
    }

-    /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn
-    /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive).
-    async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn {
-        // Freeze the current open in-memory layer. It will be written to disk on next
-        // iteration.
-
-        let _write_guard = if write_lock_held {
-            None
-        } else {
-            Some(self.write_lock.lock().await)
+    async fn freeze_inmem_layer_at(
+        &self,
+        at: Lsn,
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) {
+        let frozen = {
+            let mut guard = self.layers.write().await;
+            guard
+                .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock)
+                .await
        };
-
-        let to_lsn = self.get_last_record_lsn();
-        self.freeze_inmem_layer_at(to_lsn).await;
-        to_lsn
-    }
-
-    async fn freeze_inmem_layer_at(&self, at: Lsn) {
-        let mut guard = self.layers.write().await;
-        guard
-            .try_freeze_in_memory_layer(at, &self.last_freeze_at)
-            .await;
+        if frozen {
+            let now = Instant::now();
+            *(self.last_freeze_ts.write().unwrap()) = now;
+        }
    }

    /// Layer flusher task's main loop.
@@ -3758,18 +3836,14 @@ impl Timeline {
        }
    }

-    /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk.
-    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`].
+    /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk.
+    /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`].
    ///
-    /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case,
-    /// it means no data will be written between the top of the highest frozen layer and to_lsn,
-    /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL.
-    async fn flush_frozen_layers_and_wait(
-        &self,
-        last_record_lsn: Lsn,
-    ) -> Result<(), FlushLayerError> {
-        let mut rx = self.layer_flush_done_tx.subscribe();
-
+    /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the
+    /// case, it means no data will be written between the top of the highest frozen layer and
+    /// `at_lsn`, e.g. because this tenant shard has ingested up to `at_lsn` and not written any data
+    /// locally for that part of the WAL.
+    fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result<u64, FlushLayerError> {
        // Increment the flush cycle counter and wake up the flush task.
        // Remember the new value, so that when we listen for the flush
        // to finish, we know when the flush that we initiated has
@@ -3784,13 +3858,18 @@ impl Timeline {
        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
            my_flush_request = *counter + 1;
            *counter = my_flush_request;
-            *lsn = std::cmp::max(last_record_lsn, *lsn);
+            *lsn = std::cmp::max(at_lsn, *lsn);
        });

+        Ok(my_flush_request)
+    }
+
+    async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> {
+        let mut rx = self.layer_flush_done_tx.subscribe();
        loop {
            {
                let (last_result_counter, last_result) = &*rx.borrow();
-                if *last_result_counter >= my_flush_request {
+                if *last_result_counter >= request {
                    if let Err(err) = last_result {
                        // We already logged the original error in
                        // flush_loop. We cannot propagate it to the caller
@@ -3817,12 +3896,9 @@ impl Timeline {
        }
    }

-    fn flush_frozen_layers(&self) {
-        self.layer_flush_start_tx.send_modify(|(counter, lsn)| {
-            *counter += 1;
-
-            *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1));
-        });
+    async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> {
+        let token = self.flush_frozen_layers(at_lsn)?;
+        self.wait_flush_completion(token).await
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
@@ -4799,7 +4875,7 @@ impl Timeline {
        pitr: Duration,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> anyhow::Result<GcCutoffs> {
+    ) -> Result<GcCutoffs, PageReconstructError> {
        let _timer = self
            .metrics
            .find_gc_cutoffs_histo
@@ -4884,13 +4960,25 @@ impl Timeline {
            return Err(GcError::TimelineCancelled);
        }

-        let (horizon_cutoff, pitr_cutoff, retain_lsns) = {
+        let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
            let gc_info = self.gc_info.read().unwrap();

            let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
            let pitr_cutoff = gc_info.cutoffs.pitr;
            let retain_lsns = gc_info.retain_lsns.clone();
-            (horizon_cutoff, pitr_cutoff, retain_lsns)
+
+            // Gets the maximum LSN that holds the valid lease.
+            //
+            // Caveat: `refresh_gc_info` is in charge of updating the lease map.
+            // Here, we do not check for stale leases again.
+            let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
+
+            (
+                horizon_cutoff,
+                pitr_cutoff,
+                retain_lsns,
+                max_lsn_with_valid_lease,
+            )
        };

        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
@@ -4921,7 +5009,13 @@ impl Timeline {
            .set(Lsn::INVALID.0 as i64);

        let res = self
-            .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
+            .gc_timeline(
+                horizon_cutoff,
+                pitr_cutoff,
+                retain_lsns,
+                max_lsn_with_valid_lease,
+                new_gc_cutoff,
+            )
            .instrument(
                info_span!("gc_timeline", timeline_id = %self.timeline_id, cutoff = %new_gc_cutoff),
            )
@@ -4938,6 +5032,7 @@ impl Timeline {
        horizon_cutoff: Lsn,
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
+        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
    ) -> Result<GcResult, GcError> {
        // FIXME: if there is an ongoing detach_from_ancestor, we should just skip gc
@@ -4986,7 +5081,8 @@ impl Timeline {
        // 1. it is older than cutoff LSN;
        // 2. it is older than PITR interval;
        // 3. it doesn't need to be retained for 'retain_lsns';
-        // 4. newer on-disk image layers cover the layer's whole key range
+        // 4. it does not need to be kept for LSNs holding valid leases.
+        // 5. newer on-disk image layers cover the layer's whole key range
        //
        // TODO holding a write lock is too agressive and avoidable
        let mut guard = self.layers.write().await;
@@ -5037,7 +5133,21 @@ impl Timeline {
                }
            }

-            // 4. Is there a later on-disk layer for this relation?
+            // 4. Is there a valid lease that requires us to keep this layer?
+            if let Some(lsn) = &max_lsn_with_valid_lease {
+                // keep if the layer starts at or before the highest leased LSN
+                if &l.get_lsn_range().start <= lsn {
+                    debug!(
+                        "keeping {} because there is a valid lease preventing GC at {}",
+                        l.layer_name(),
+                        lsn,
+                    );
+                    result.layers_needed_by_leases += 1;
+                    continue 'outer;
+                }
+            }
+
+            // 5. Is there a later on-disk layer for this relation?
            //
            // The end-LSN is exclusive, while disk_consistent_lsn is
            // inclusive. For example, if disk_consistent_lsn is 100, it is
@@ -5415,6 +5525,11 @@ impl Timeline {
        self.last_record_lsn.advance(new_lsn);
    }

+    #[cfg(test)]
+    pub(super) fn force_set_disk_consistent_lsn(&self, new_value: Lsn) {
+        self.disk_consistent_lsn.store(new_value);
+    }
+
    /// Force create an image layer and place it into the layer map.
    ///
    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
@@ -5536,10 +5651,33 @@ impl Timeline {
        all_data.sort();
        Ok(all_data)
    }
+
+    /// Get all historic layer descriptors in the layer map
+    #[cfg(test)]
+    pub(crate) async fn inspect_historic_layers(
+        self: &Arc<Timeline>,
+    ) -> anyhow::Result<Vec<super::storage_layer::PersistentLayerKey>> {
+        let mut layers = Vec::new();
+        let guard = self.layers.read().await;
+        for layer in guard.layer_map().iter_historic_layers() {
+            layers.push(layer.key());
+        }
+        Ok(layers)
+    }
+
+    #[cfg(test)]
+    pub(crate) fn add_extra_test_dense_keyspace(&self, ks: KeySpace) {
+        let mut keyspace = self.extra_test_dense_keyspace.load().as_ref().clone();
+        keyspace.merge(&ks);
+        self.extra_test_dense_keyspace.store(Arc::new(keyspace));
+    }
 }

 type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

+/// Tracking writes ingestion does to a particular in-memory layer.
+///
+/// Cleared upon freezing a layer.
 struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
    current_size: u64,
@@ -5580,12 +5718,6 @@ impl Deref for TimelineWriter<'_> {
    }
 }

-impl Drop for TimelineWriter<'_> {
-    fn drop(&mut self) {
-        self.write_guard.take();
-    }
-}
-
 #[derive(PartialEq)]
 enum OpenLayerAction {
    Roll,
@@ -5668,16 +5800,15 @@ impl<'a> TimelineWriter<'a> {
    }

    async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> {
-        assert!(self.write_guard.is_some());
-
-        self.tl.freeze_inmem_layer_at(freeze_at).await;
-
-        let now = Instant::now();
-        *(self.last_freeze_ts.write().unwrap()) = now;
-
-        self.tl.flush_frozen_layers();
-
        let current_size = self.write_guard.as_ref().unwrap().current_size;
+
+        // self.write_guard will be taken by the freezing
+        self.tl
+            .freeze_inmem_layer_at(freeze_at, &mut self.write_guard)
+            .await;
+
+        self.tl.flush_frozen_layers(freeze_at)?;
+
        if current_size >= self.get_checkpoint_distance() * 2 {
            warn!("Flushed oversized open layer with size {}", current_size)
        }
@@ -5691,9 +5822,27 @@ impl<'a> TimelineWriter<'a> {
            return OpenLayerAction::Open;
        };

+        #[cfg(feature = "testing")]
+        if state.cached_last_freeze_at < self.tl.last_freeze_at.load() {
+            // this check and assertion are not really needed because
+            // LayerManager::try_freeze_in_memory_layer will always clear out the
+            // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there
+            // is no TimelineWriterState.
+            assert!(
+                state.open_layer.end_lsn.get().is_some(),
+                "our open_layer must be outdated"
+            );
+
+            // this would be a memory leak waiting to happen because the in-memory layer always has
+            // an index
+            panic!("BUG: TimelineWriterState held on to frozen in-memory layer.");
+        }
+
        if state.prev_lsn == Some(lsn) {
-            // Rolling mid LSN is not supported by downstream code.
+            // Rolling mid LSN is not supported by [downstream code].
            // Hence, only roll at LSN boundaries.
+            //
+            // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422
            return OpenLayerAction::None;
        }

--- a/pageserver/src/tenant/timeline/analysis.rs
+++ b/pageserver/src/tenant/timeline/analysis.rs
@@ -0,0 +1,90 @@
+use std::{collections::BTreeSet, ops::Range};
+
+use utils::lsn::Lsn;
+
+use super::Timeline;
+
+#[derive(serde::Serialize)]
+pub(crate) struct RangeAnalysis {
+    start: String,
+    end: String,
+    has_image: bool,
+    num_of_deltas_above_image: usize,
+    total_num_of_deltas: usize,
+}
+
+impl Timeline {
+    /// Estimates read amplification for each key range between adjacent layer boundaries,
+    /// returning one [`RangeAnalysis`] per range (none if there are no persistent layers).
+    pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
+        // First, collect all split points of the layers.
+        let mut split_points = BTreeSet::new();
+        let mut delta_ranges = Vec::new();
+        let mut image_ranges = Vec::new();
+
+        let all_layer_files = {
+            let guard = self.layers.read().await;
+            guard.all_persistent_layers()
+        };
+        let lsn = self.get_last_record_lsn();
+
+        for key in all_layer_files {
+            split_points.insert(key.key_range.start);
+            split_points.insert(key.key_range.end);
+            if key.is_delta {
+                delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
+            } else {
+                image_ranges.push((key.key_range.clone(), key.lsn_range.start));
+            }
+        }
+
+        // For each split range, compute the estimated read amplification.
+        let split_points = split_points.into_iter().collect::<Vec<_>>();
+        let mut result = Vec::new();
+        // `windows(2)` yields no pairs for an empty layer map, unlike `0..len() - 1`, which underflows.
+        for window in split_points.windows(2) {
+            let (start, end) = (window[0], window[1]);
+            // Find the latest image layer that contains the information.
+            let mut maybe_image_layers = image_ranges
+                .iter()
+                // We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
+                .filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
+                .cloned()
+                .collect::<Vec<_>>();
+            maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
+            let image_layer = maybe_image_layers.last().cloned();
+            let lsn_filter_start = image_layer
+                .as_ref()
+                .map(|(_, lsn)| *lsn)
+                .unwrap_or(Lsn::INVALID);
+
+            fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
+                !(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
+            }
+
+            let maybe_delta_layers = delta_ranges
+                .iter()
+                .filter(|(key_range, lsn_range)| {
+                    key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
+                })
+                .cloned()
+                .collect::<Vec<_>>();
+
+            let pitr_delta_layers = delta_ranges
+                .iter()
+                .filter(|(key_range, _)| key_range.contains(&start))
+                .cloned()
+                .collect::<Vec<_>>();
+
+            result.push(RangeAnalysis {
+                start: start.to_string(),
+                end: end.to_string(),
+                has_image: image_layer.is_some(),
+                num_of_deltas_above_image: maybe_delta_layers.len(),
+                total_num_of_deltas: pitr_delta_layers.len(),
+            });
+        }
+
+        result
+    }
+}
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -952,6 +952,178 @@ impl Timeline {
        adaptor.flush_updates().await?;
        Ok(())
    }
+
+    /// An experimental compaction building block that combines compaction with garbage collection.
+    ///
+    /// The current implementation picks all delta + image layers whose LSN range starts at or below the
+    /// GC cutoff (the minimum of the `horizon` and `pitr` cutoffs), without considering retain_lsns. Then,
+    /// it does a full compaction over all these layers: for every key it reconstructs one image at the GC
+    /// cutoff from the values at or below the cutoff, and it copies every value strictly above the cutoff
+    /// into a single new delta layer. The selected layers and the two new layers are then handed to
+    /// `LayerManager::finish_gc_compaction`.
+    ///
+    /// Returns `Ok(())` without creating any layer when the selected layers contain no key-value pairs.
+    #[cfg(test)]
+    pub(crate) async fn compact_with_gc(
+        self: &Arc<Self>,
+        _cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        use crate::tenant::storage_layer::ValueReconstructState;
+        // Step 0: pick all delta layers + image layers below/intersect with the GC horizon.
+        // The layer selection has the following properties:
+        // 1. If a layer is in the selection, all layers below it are in the selection.
+        // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
+        let (layer_selection, gc_cutoff) = {
+            let guard = self.layers.read().await;
+            let layers = guard.layer_map();
+            let gc_info = self.gc_info.read().unwrap();
+            let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
+            let mut selected_layers = Vec::new();
+            // TODO: consider retain_lsns
+            drop(gc_info);
+            for desc in layers.iter_historic_layers() {
+                if desc.get_lsn_range().start <= gc_cutoff {
+                    selected_layers.push(guard.get_from_desc(&desc));
+                }
+            }
+            (selected_layers, gc_cutoff)
+        };
+        // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
+        let mut all_key_values = Vec::new();
+        for layer in &layer_selection {
+            all_key_values.extend(layer.load_key_values(ctx).await?);
+        }
+        if all_key_values.is_empty() {
+            // Nothing to compact. Bail out before the `first()` / `last()` / `max()` lookups below, which
+            // would otherwise panic on an empty list.
+            return Ok(());
+        }
+        // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
+        // image layers, make image appear later than delta.
+        struct ValueWrapper<'a>(&'a crate::repository::Value);
+        impl Ord for ValueWrapper<'_> {
+            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+                use crate::repository::Value;
+                use std::cmp::Ordering;
+                match (self.0, other.0) {
+                    (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater,
+                    (Value::WalRecord(_), Value::Image(_)) => Ordering::Less,
+                    _ => Ordering::Equal,
+                }
+            }
+        }
+        impl PartialOrd for ValueWrapper<'_> {
+            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+                Some(self.cmp(other))
+            }
+        }
+        impl PartialEq for ValueWrapper<'_> {
+            fn eq(&self, other: &Self) -> bool {
+                self.cmp(other) == std::cmp::Ordering::Equal
+            }
+        }
+        impl Eq for ValueWrapper<'_> {}
+        all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
+            (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
+        });
+        // Exclusive upper bound of the new delta layer's LSN range. `all_key_values` is non-empty here, so
+        // `max()` always yields a value.
+        let max_lsn = all_key_values
+            .iter()
+            .map(|(_, lsn, _)| lsn)
+            .max()
+            .copied()
+            .unwrap()
+            + 1;
+        // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
+        // Data of the same key.
+        let mut accumulated_values = Vec::new();
+        let mut last_key = all_key_values.first().unwrap().0; // non-empty: checked right after step 1
+
+        /// Take the images and deltas of one key (ordered by LSN, low to high), produce an image at the GC
+        /// horizon, and a list of the values above the GC horizon (also ordered low to high).
+        async fn flush_accumulated_states(
+            tline: &Arc<Timeline>,
+            key: Key,
+            accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
+            horizon: Lsn,
+        ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
+            let mut base_image = None;
+            let mut keys_above_horizon = Vec::new();
+            let mut delta_above_base_image = Vec::new();
+            // Walk from the newest entry to the oldest, so that the scan can stop at the first image at or
+            // below the horizon: anything older than that image is garbage and is never looked at.
+            for (entry_key, lsn, val) in accumulated_values.iter().rev() {
+                if *lsn > horizon {
+                    // TODO: ensure one LSN corresponds to either delta or image instead of both
+                    keys_above_horizon.push((*entry_key, *lsn, val.clone()));
+                    continue;
+                }
+                match val {
+                    crate::repository::Value::Image(image) => {
+                        base_image = Some((*lsn, image.clone()));
+                        break;
+                    }
+                    crate::repository::Value::WalRecord(wal) => {
+                        delta_above_base_image.push((*lsn, wal.clone()));
+                    }
+                }
+            }
+            // Both lists were filled newest-first; restore ascending LSN order.
+            delta_above_base_image.reverse();
+            keys_above_horizon.reverse();
+            let state = ValueReconstructState {
+                img: base_image,
+                records: delta_above_base_image,
+            };
+            let img = tline.reconstruct_value(key, horizon, state).await?;
+            Ok((keys_above_horizon, img))
+        }
+
+        let mut delta_layer_writer = DeltaLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            all_key_values.first().unwrap().0,
+            gc_cutoff..max_lsn, // TODO: off by one?
+            ctx,
+        )
+        .await?;
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
+            gc_cutoff,
+            ctx,
+        )
+        .await?;
+
+        // The list is sorted by key, so all entries of one key are adjacent: accumulate them until the key
+        // changes, then flush the finished key into the two writers.
+        for item @ (key, _, _) in &all_key_values {
+            if &last_key == key {
+                accumulated_values.push(item);
+            } else {
+                let (deltas, image) =
+                    flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
+                        .await?;
+                image_layer_writer.put_image(last_key, image, ctx).await?;
+                for (key, lsn, val) in deltas {
+                    delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+                }
+                accumulated_values.clear();
+                accumulated_values.push(item);
+                last_key = *key;
+            }
+        }
+        // Flush the last key, which the loop above only accumulated.
+        let (deltas, image) =
+            flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
+        image_layer_writer.put_image(last_key, image, ctx).await?;
+        for (key, lsn, val) in deltas {
+            delta_layer_writer.put_value(key, lsn, val, ctx).await?;
+        }
+        accumulated_values.clear();
+        // TODO: split layers
+        let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?;
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
+        // Step 3: Place back to the layer map.
+        {
+            let mut guard = self.layers.write().await;
+            guard.finish_gc_compaction(
+                &layer_selection,
+                &[delta_layer.clone(), image_layer.clone()],
+                &self.metrics,
+            )
+        };
+        Ok(())
+    }
 }

 struct TimelineAdaptor {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -11,7 +11,6 @@ use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};

 use crate::{
    config::PageServerConf,
-    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
        metadata::TimelineMetadata,
@@ -263,7 +262,6 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
-        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -274,7 +272,6 @@ impl DeleteTimelineFlow {
                None, // Ancestor is not needed for deletion.
                TimelineResources {
                    remote_client,
-                    deletion_queue_client,
                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
                },
                // Important. We dont pass ancestor above because it can be missing.
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure, Context, Result};
+use itertools::Itertools;
 use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
@@ -20,6 +21,8 @@ use crate::{
    },
 };

+use super::TimelineWriterState;
+
 /// Provides semantic APIs to manipulate the layer map.
 #[derive(Default)]
 pub(crate) struct LayerManager {
@@ -119,18 +122,20 @@ impl LayerManager {
        Ok(layer)
    }

-    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub(crate) async fn try_freeze_in_memory_layer(
+    /// Tries to freeze an open layer and also manages clearing the TimelineWriterState.
+    ///
+    /// Returns true if anything was frozen.
+    pub(super) async fn try_freeze_in_memory_layer(
        &mut self,
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
-    ) {
+        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
+    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);

-        if let Some(open_layer) = &self.layer_map.open_layer {
+        let froze = if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
-            // Does this layer need freezing?
            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
@@ -138,11 +143,25 @@ impl LayerManager {
            self.layer_map.frozen_layers.push_back(open_layer_rc);
            self.layer_map.open_layer = None;
            self.layer_map.next_open_layer_at = Some(end_lsn);
-        }
+
+            true
+        } else {
+            false
+        };

        // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this
        // accounts for regions in the LSN range where we might have ingested no data due to sharding.
        last_freeze_at.store(end_lsn);
+
+        // the writer state must no longer have a reference to the frozen layer
+        let taken = write_lock.take();
+        assert_eq!(
+            froze,
+            taken.is_some(),
+            "should only had frozen a layer when TimelineWriterState existed"
+        );
+
+        froze
    }

    /// Add image layers to the layer map, called from `create_image_layers`.
@@ -207,6 +226,18 @@ impl LayerManager {
        updates.flush();
    }

+    /// Called when a GC-compaction is completed.
+    ///
+    /// Forwards `compact_from`, `compact_to` and `metrics` unchanged to `finish_compact_l0`; this
+    /// function adds no logic of its own. Only compiled for tests.
+    #[cfg(test)]
+    pub(crate) fn finish_gc_compaction(
+        &mut self,
+        compact_from: &[Layer],
+        compact_to: &[ResidentLayer],
+        metrics: &TimelineMetrics,
+    ) {
+        // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification.
+        self.finish_compact_l0(compact_from, compact_to, metrics)
+    }
+
    /// Called when compaction is completed.
    pub(crate) fn rewrite_layers(
        &mut self,
@@ -308,6 +339,10 @@ impl LayerManager {
    pub(crate) fn contains(&self, layer: &Layer) -> bool {
        self.layer_fmgr.contains(layer)
    }
+
+    /// Returns a freshly allocated list holding a clone of every key stored in `layer_fmgr`.
+    ///
+    /// NOTE(review): the keys presumably come out of the `HashMap` wrapped by `LayerFileManager`, so
+    /// callers should not rely on the order of the returned list -- confirm against the field's type.
+    pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
+        self.layer_fmgr.0.keys().cloned().collect_vec()
+    }
 }

 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -49,6 +49,19 @@ pub enum NeonWalRecord {
        file_path: String,
        content: Option<Bytes>,
    },
+
+    /// A record type for unit tests. It supports appending data to an existing image, or clearing it.
+    #[cfg(test)]
+    Test {
+        /// Append a string to the image.
+        append: String,
+        /// Clear the image before appending.
+        clear: bool,
+        /// Treat this record as an init record. `clear` should be set to true if this field is set
+        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
+        /// its references in `timeline.rs`.
+        will_init: bool,
+    },
 }

 impl NeonWalRecord {
@@ -58,11 +71,39 @@ impl NeonWalRecord {
        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
-
+            #[cfg(test)]
+            NeonWalRecord::Test { will_init, .. } => *will_init,
            // None of the special neon record types currently initialize the page
            _ => false,
        }
    }
+
+    /// Builds a test record that appends `s` to the image without clearing it first.
+    #[cfg(test)]
+    pub(crate) fn wal_append(s: impl AsRef<str>) -> Self {
+        let append = String::from(s.as_ref());
+        NeonWalRecord::Test {
+            append,
+            clear: false,
+            will_init: false,
+        }
+    }
+
+    /// Builds a test record that clears the image and appends nothing; it is not an init record.
+    #[cfg(test)]
+    pub(crate) fn wal_clear() -> Self {
+        NeonWalRecord::Test {
+            append: String::new(),
+            clear: true,
+            will_init: false,
+        }
+    }
+
+    /// Builds a test init record: it clears the image, appends nothing, and has `will_init` set.
+    #[cfg(test)]
+    pub(crate) fn wal_init() -> Self {
+        NeonWalRecord::Test {
+            append: String::new(),
+            clear: true,
+            will_init: true,
+        }
+    }
 }

 /// DecodedBkpBlock represents per-page data contained in a WAL record.
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -244,6 +244,20 @@ pub(crate) fn apply_in_neon(
            let mut writer = page.writer();
            dir.ser_into(&mut writer)?;
        }
+        #[cfg(test)]
+        NeonWalRecord::Test {
+            append,
+            clear,
+            will_init,
+        } => {
+            if *will_init {
+                assert!(*clear, "init record must be clear to ensure correctness");
+            }
+            if *clear {
+                page.clear();
+            }
+            page.put_slice(append.as_bytes());
+        }
    }
    Ok(())
 }
--- a/patches/pg_anon.patch
+++ b/patches/pg_anon.patch
@@ -0,0 +1,223 @@
+commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Fri May 31 06:34:26 2024 +0000
+
+    These alternative expected files were added to consider the neon features
+
+diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
+new file mode 100644
+index 0000000..2539cfd
+--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
+@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE:  installing required extension "pgcrypto"
+SELECT anon.init();
+ init 
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+  PERFORM anon.init();
+  EXCEPTION WHEN insufficient_privilege
+  THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE:  insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+  PERFORM anon.anonymize_table('t1');
+  EXCEPTION WHEN insufficient_privilege
+  THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE:  insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR:  Only supersusers can start the dynamic masking engine.
+CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking 
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t 
+---+---
+ 1 | 
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+  SELECT * FROM public.t1;
+  EXCEPTION WHEN insufficient_privilege
+  THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE:  insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR:  Only supersusers can stop the dynamic masking engine.
+CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking 
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR:  permission denied
+DETAIL:  The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
+diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
+new file mode 100644
+index 0000000..8b090fe
+--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
+@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE:  installing required extension "pgcrypto"
+SELECT anon.init();
+ init 
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+  PERFORM anon.init();
+  EXCEPTION WHEN insufficient_privilege
+  THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE:  insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table 
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t 
+---+---
+ 1 | 
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking 
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking 
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i |  t   
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR:  permission denied for schema mask
+CONTEXT:  SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+  FROM pg_catalog.pg_class
+  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+  AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking 
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR:  permission denied
+DETAIL:  The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
--- a/patches/pg_cron.patch
+++ b/patches/pg_cron.patch
@@ -0,0 +1,19 @@
+commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master)
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Fri Jun 7 19:23:42 2024 +0000
+
+    Disable REGRESS_OPTIONS causing initdb
+
+diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile
+index 053314c..fbd5fb5 100644
+--- a/ext-src/pg_cron-src/Makefile
+++ b/ext-src/pg_cron-src/Makefile
+@@ -5,7 +5,7 @@ EXTENSION = pg_cron
+ DATA_built = $(EXTENSION)--1.0.sql
+ DATA = $(wildcard $(EXTENSION)--*--*.sql)
+ 
+-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
+#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check
+ REGRESS = pg_cron-test 
+ 
+ # compilation configuration
--- a/patches/pg_hintplan.patch
+++ b/patches/pg_hintplan.patch
@@ -0,0 +1,39 @@
+commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
+Author: Alexey Masterov <alexeymasterov@neon.tech>
+Date:   Thu Jun 6 08:02:42 2024 +0000
+
+    Patch expected files to consider Neon's log messages
+
+diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
+index da723b8..f8d0102 100644
+--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
+@@ -9,13 +9,16 @@ SET search_path TO public;
+ ----
+ -- No.A-1-1-3
+ CREATE EXTENSION pg_hint_plan;
+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ -- No.A-1-2-3
+ DROP EXTENSION pg_hint_plan;
+ -- No.A-1-1-4
+ CREATE SCHEMA other_schema;
+ CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
+ CREATE EXTENSION pg_hint_plan;
+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ DROP SCHEMA other_schema;
+ ----
+ ---- No. A-5-1 comment pattern
+diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+index d372459..6282afe 100644
+--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
+@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
+ SET client_min_messages TO LOG;
+ SET pg_hint_plan.enable_hint TO on;
+ CREATE EXTENSION file_fdw;
+LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
+ CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
+ CREATE USER MAPPING FOR PUBLIC SERVER file_server;
+ CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -0,0 +1,62 @@
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index dcfb2bd..d5189ee 100644
+--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
+@@ -860,9 +860,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1117,12 +1125,38 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
+-	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM)
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
+	if (RelationNeedsWAL(index) || forkNum == INIT_FORKNUM) {
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocksInFork(index, forkNum), true);
+#ifdef NEON_SMGR
+		{
+#if PG_VERSION_NUM >= 160000
+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+									   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+		}
+#endif
+	}
+
+#ifdef NEON_SMGR
+	smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+ 
+ 	FreeBuildState(buildstate);
+ }
--- a/pgxn/.dir-locals.el
+++ b/pgxn/.dir-locals.el
@@ -0,0 +1,19 @@
+;; see also src/tools/editors/emacs.samples for more complete settings
+
+((c-mode . ((c-basic-offset . 4)
+            (c-file-style . "bsd")
+            (fill-column . 78)
+            (indent-tabs-mode . t)
+            (tab-width . 4)))
+ (nxml-mode . ((fill-column . 78)
+               (indent-tabs-mode . nil)))
+ (perl-mode . ((perl-indent-level . 4)
+               (perl-continued-statement-offset . 2)
+               (perl-continued-brace-offset . -2)
+               (perl-brace-offset . 0)
+               (perl-brace-imaginary-offset . 0)
+               (perl-label-offset . -2)
+               (indent-tabs-mode . t)
+               (tab-width . 4)))
+ (sgml-mode . ((fill-column . 78)
+               (indent-tabs-mode . nil))))
--- a/pgxn/.editorconfig
+++ b/pgxn/.editorconfig
@@ -0,0 +1,14 @@
+root = true
+
+[*.{c,h,l,y,pl,pm}]
+indent_style = tab
+indent_size = tab
+tab_width = 4
+
+[*.{sgml,xml}]
+indent_style = space
+indent_size = 1
+
+[*.xsl]
+indent_style = space
+indent_size = 2
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -381,6 +381,15 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
 		shard->last_reconnect_time = now;

+		/*
+		 * Make sure we don't do exponential backoff with a constant multiplier
+		 * of 0 us, as that doesn't really do much for timeouts...
+		 *
+		 * cf. https://github.com/neondatabase/neon/issues/7897
+		 */
+		if (shard->delay_us == 0)
+			shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
+
 		/*
 		 * If we did other tasks between reconnect attempts, then we won't
 		 * need to wait as long as a full delay.
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -19,6 +19,7 @@
 #include "catalog/pg_type.h"
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
+#include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/procsignal.h"
@@ -280,6 +281,7 @@ _PG_init(void)
 	pg_init_libpagestore();
 	pg_init_walproposer();
        WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
+	LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitLogicalReplicationMonitor();

--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -295,18 +295,10 @@ extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
-extern bool set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size, BlockNumber* old_size);
+extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);

-extern bool start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize);
-extern bool is_unlogged_build_extend(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize);
-extern bool is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber* relsize);
-extern bool stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum);
-extern void resume_unlogged_build(void);
-
-
-
 /* functions for local file cache */
 #if PG_MAJORVERSION_NUM < 16
 extern void lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -10,6 +10,10 @@
 * Temporary and unlogged tables are stored locally, by md.c. The functions
 * here just pass the calls through to corresponding md.c functions.
 *
+ * Index build operations that use the buffer cache are also handled locally,
+ * just like unlogged tables. Such operations must be marked by calling
+ * smgr_start_unlogged_build() and friends.
+ *
 * In order to know what relations are permanent and which ones are not, we
 * have added a 'smgr_relpersistence' field to SmgrRelationData, and it is set
 * by smgropen() callers, when they have the relcache entry at hand.  However,
@@ -60,7 +64,6 @@
 #include "storage/fsm_internals.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
-#include "utils/rel.h"

 #include "pagestore_client.h"

@@ -97,7 +100,17 @@ const int	SmgrTrace = DEBUG5;

 page_server_api *page_server;

-const PGAlignedBlock zero_buffer;
+/* unlogged relation build states */
+typedef enum
+{
+	UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
+	UNLOGGED_BUILD_PHASE_1,
+	UNLOGGED_BUILD_PHASE_2,
+	UNLOGGED_BUILD_NOT_PERMANENT
+} UnloggedBuildPhase;
+
+static SMgrRelation unlogged_build_rel = NULL;
+static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
@@ -1393,20 +1406,6 @@ PageIsEmptyHeapPage(char *buffer)
 * A page is being evicted from the shared buffer cache. Update the
 * last-written LSN of the page, and WAL-log it if needed.
 */
-static void
-unlogged_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber old_relsize, BlockNumber new_relsize)
-{
-	if (new_relsize > old_relsize)
-	{
-#if PG_MAJORVERSION_NUM < 16
-		mdextend(reln, forknum, new_relsize, (char *) zero_buffer.data, true);
-#else
-		mdzeroextend(reln, forknum, old_relsize, new_relsize - old_relsize, true);
-#endif
-	}
-}
-
-
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
@@ -1414,7 +1413,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
 neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
 #endif
 {
-	BlockNumber relsize;
 	XLogRecPtr	lsn = PageGetLSN((Page) buffer);
 	bool		log_page;

@@ -1481,7 +1479,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		#if 0
 		else if (PageIsEmptyHeapPage((Page) buffer))
 		{
 			ereport(SmgrTrace,
@@ -1490,95 +1487,34 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
 		}
-		#endif
 		else if (forknum != FSM_FORKNUM && forknum != VISIBILITYMAP_FORKNUM)
 		{
-			if (start_unlogged_build(InfoFromSMgrRel(reln), forknum, blocknum, &relsize))
-			{
-				mdcreate(reln, forknum, true);
-				if (relsize != 0)
-					unlogged_extend(reln, forknum, 0, relsize);
-				elog(SmgrTrace, "neon_wallog_page: start unlogged %u/%u/%u.%u blk %u, relsize %u",
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum, blocknum, relsize);
-			}
-			else
-				elog(SmgrTrace, "neon_wallog_page: continue unlogged %u/%u/%u.%u blk %u, relsize %u",
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum, blocknum, relsize);
-
-			if (blocknum >= relsize)
-			{
-				unlogged_extend(reln, forknum, relsize, blocknum+1);
-			}
-			mdwrite(reln, forknum, blocknum, buffer, true);
-			resume_unlogged_build();
-
-			ereport(SmgrTrace,
-					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is saved locally.",
+			/*
+			 * It's a bad sign if there is a page with zero LSN in the buffer
+			 * cache in a standby, too. However, PANICing seems like a cure
+			 * worse than the disease, as the damage has likely already been
+			 * done in the primary. So in a standby, make this an assertion,
+			 * and in a release build just LOG the error and soldier on. We
+			 * update the last-written LSN of the page with a conservative
+			 * value in that case, which is the last replayed LSN.
+			 */
+			ereport(RecoveryInProgress() ? LOG : PANIC,
+					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
 							blocknum,
 							RelFileInfoFmt(InfoFromSMgrRel(reln)),
 							forknum)));
+			Assert(false);
+
 			lsn = GetXLogReplayRecPtr(NULL); /* in standby mode, soldier on */
 		}
 	}
-	else if (lsn < FirstNormalUnloggedLSN)
-	{
-		if (start_unlogged_build(InfoFromSMgrRel(reln),forknum, blocknum, &relsize))
-		{
-			mdcreate(reln, forknum, true);
-			if (relsize != 0)
-				unlogged_extend(reln, forknum, 0, relsize);
-			elog(SmgrTrace, "neon_wallog_page: start unlogged %u/%u/%u.%u blk %u, relsize %u, LSN %X",
-				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-				 forknum, blocknum, relsize, (unsigned)lsn);
-		}
-		else
-			elog(SmgrTrace, "neon_wallog_page: continue unlogged %u/%u/%u.%u blk %u, relsize %u, LSN %X",
-				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-				 forknum, blocknum, relsize, (unsigned)lsn);
-		if (blocknum >= relsize)
-		{
-			unlogged_extend(reln, forknum, relsize, blocknum+1);
-		}
-		mdwrite(reln, forknum, blocknum, buffer, true);
-		resume_unlogged_build();
-
-		ereport(SmgrTrace,
-				(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is saved locally.",
-						blocknum,
-						RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						forknum)));
-	}
 	else
 	{
-		if (is_unlogged_build_extend(InfoFromSMgrRel(reln), forknum, blocknum, &relsize))
-		{
-			elog(SmgrTrace, "neon_wallog_page: unlogged extend %u/%u/%u.%u blk %u, relsize %u",
-				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-				 forknum, blocknum, relsize);
-			if (blocknum >= relsize)
-			{
-				unlogged_extend(reln, forknum, relsize, blocknum+1);
-			}
-			mdwrite(reln, forknum, blocknum, buffer, true);
-			ereport(SmgrTrace,
-					(errmsg(NEON_TAG "Page %u with LSN=%X/%X of relation %u/%u/%u.%u is saved locally.",
-							blocknum,
-							LSN_FORMAT_ARGS(lsn),
-							RelFileInfoFmt(InfoFromSMgrRel(reln)),
-							forknum)));
-		}
-		else
-		{
-			ereport(SmgrTrace,
-					(errmsg(NEON_TAG "Page %u of relation %u/%u/%u.%u is already wal-logged at lsn=%X/%X",
+		ereport(SmgrTrace,
+				(errmsg(NEON_TAG "Evicting page %u of relation %u/%u/%u.%u with lsn=%X/%X",
 						blocknum,
 						RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						forknum, LSN_FORMAT_ARGS(lsn)
-					)));
-		}
-		resume_unlogged_build();
+						forknum, LSN_FORMAT_ARGS(lsn))));
 	}

 	/*
@@ -1588,27 +1524,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 	SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
 }

-/*
- * Check if unlogged build is in progress for specified relation
- * and stop it if so. It is used as callback for log_newpage_range( function
- * which is called at the end of unlogged build.
- */
-static void
-neon_log_newpage_range_callback(Relation rel, ForkNumber forknum)
-{
-	SMgrRelation smgr = RelationGetSmgr(rel);
-	if (stop_unlogged_build(InfoFromSMgrRel(smgr), forknum))
-	{
-		mdclose(smgr, forknum);
-		/* use isRedo == true, so that we drop it immediately */
-		mdunlink(InfoBFromSMgrRel(smgr), forknum, true);
-		resume_unlogged_build(); /* doesn't actually resume build, just release lock */
-	}
-}
-
-
-
-
 /*
 *	neon_init() -- Initialize private state
 */
@@ -1644,8 +1559,6 @@ neon_init(void)
 	old_redo_read_buffer_filter = redo_read_buffer_filter;
 	redo_read_buffer_filter = neon_redo_read_buffer_filter;

-	log_newpage_range_callback = neon_log_newpage_range_callback;
-
 #ifdef DEBUG_COMPARE_LOCAL
 	mdinit();
 #endif
@@ -2108,7 +2021,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 						   &reln->smgr_cached_nblocks[forkNum]);
 	}
 	else
-		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0, NULL);
+		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2168,7 +2081,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 #endif
 {
 	XLogRecPtr	lsn;
-	BlockNumber old_relsize;
 	BlockNumber n_blocks = 0;

 	switch (reln->smgr_relpersistence)
@@ -2220,14 +2132,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);

 	neon_wallog_page(reln, forkNum, blkno, buffer, false);
-
-	if (set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1, &old_relsize))
-	{
-		unlogged_extend(reln, forkNum, old_relsize, blkno + 1);
-		resume_unlogged_build();
-	}
-	else /* Do not store pages during unlogedbuild in LFC two avoid double local storage consumption */
-		lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
+	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);

 	lsn = PageGetLSN((Page) buffer);
 	neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
@@ -2235,6 +2140,8 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);

+	lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
 		mdextend(reln, forkNum, blkno, buffer, skipFsync);
@@ -2260,10 +2167,9 @@ void
 neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 				int nblocks, bool skipFsync)
 {
-	BlockNumber old_relsize;
-	BlockNumber	remblocks = nblocks;
+	const PGAlignedBlock buffer = {0};
+	int			remblocks = nblocks;
 	XLogRecPtr	lsn = 0;
-	bool unlogged = false;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2312,29 +2218,8 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;

-	if (set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum + nblocks, &old_relsize))
-	{
-		unlogged_extend(reln, forkNum, old_relsize, blocknum + nblocks);
-		resume_unlogged_build();
-		unlogged = true;
-	}
-
-	if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks  */
-	{
-		/* ensure we have enough xlog buffers to log max-sized records */
-		XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
-	}
-	else
-	{
-		/*
-		 * smgr_extend is often called with an all-zeroes page, so
-		 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
-		 * later, after it has been initialized with the real page contents, and
-		 * it is eventually evicted from the buffer cache. But we need a valid LSN
-		 * to the relation metadata update now.
-		 */
-		lsn = GetXLogInsertRecPtr();
-	}
+	/* ensure we have enough xlog buffers to log max-sized records */
+	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);

 	/*
 	 * Iterate over all the pages. They are collected into batches of
@@ -2345,20 +2230,17 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);

-		if (forkNum != MAIN_FORKNUM) /* no need to wal-log zero pages except VM/FSM forks  */
-		{
-			XLogBeginInsert();
+		XLogBeginInsert();

-			for (int i = 0; i < count; i++)
-				XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
-								  (char *) zero_buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+		for (int i = 0; i < count; i++)
+			XLogRegisterBlock(i, &InfoFromSMgrRel(reln), forkNum, blocknum + i,
+							  (char *) buffer.data, REGBUF_FORCE_IMAGE | REGBUF_STANDARD);
+
+		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);

-			lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
-		}
 		for (int i = 0; i < count; i++)
 		{
-			if (!unlogged) /* Do not store pages during unlogedbuild in LFC two avoid double local storage consumption */
-				lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, zero_buffer.data);
+			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
 			SetLastWrittenLSNForBlock(lsn, InfoFromSMgrRel(reln), forkNum,
 									  blocknum + i);
 		}
@@ -2370,6 +2252,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	Assert(lsn != 0);

 	SetLastWrittenLSNForRelation(lsn, InfoFromSMgrRel(reln), forkNum);
+	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
 }
 #endif

@@ -2636,7 +2519,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 #endif
 {
 	neon_request_lsns request_lsns;
-	BlockNumber relsize;

 	switch (reln->smgr_relpersistence)
 	{
@@ -2655,33 +2537,15 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (is_unlogged_build(InfoFromSMgrRel(reln), forkNum, &relsize))
+	/* Try to read from local file cache */
+	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
 	{
-		if (blkno >= relsize)
-		{
-			elog(SmgrTrace, "Get empty local page %d of relation %u/%u/%u.%u",
-				 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum);
-			memset(buffer, 0, BLCKSZ);
-		}
-		else
-		{
-			elog(SmgrTrace, "Read local page %d of relation %u/%u/%u.%u",
-				 blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum);
-			mdread(reln, forkNum, blkno, buffer);
-		}
-		resume_unlogged_build();
+		return;
 	}
-	else
-	{
-		/* Try to read from local file cache */
-		if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
-		{
-			return;
-		}

-		request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
-		neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
-	}
+	request_lsns = neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno);
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
 	{
@@ -2791,36 +2655,24 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *bu
 neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 #endif
 {
-	BlockNumber relsize;
 	XLogRecPtr	lsn;
-	bool unlogged = false;

 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
-			if (is_unlogged_build_extend(InfoFromSMgrRel(reln), forknum, blocknum, &relsize))
+			/* This is a bit tricky. Check if the relation exists locally */
+			if (mdexists(reln, forknum))
 			{
-				if (blocknum >= relsize)
-				{
-					unlogged_extend(reln, forknum, relsize, blocknum+1);
-				}
-				unlogged = true;
-				elog(SmgrTrace, "neon_write: extend %u/%u/%u.%u blk %u, relsize %u",
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum, blocknum, relsize);
-			} else {
-				unlogged = mdexists(reln, forknum);
-			}
-			if (unlogged)
-			{
-				elog(SmgrTrace, "neon_write: mdwrite %u/%u/%u.%u blk %u",
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum, blocknum);
+				/* It exists locally. Guess it's unlogged then. */
 				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
-			}
-			resume_unlogged_build();
-			if (unlogged)
-			{
+
+				/*
+				 * We could set relpersistence now that we have determined
+				 * that it's local. But we don't dare to do it, because that
+				 * would immediately allow reads as well, which shouldn't
+				 * happen. We could cache it with a different 'relpersistence'
+				 * value, but this isn't performance critical.
+				 */
 				return;
 			}
 			break;
@@ -3012,7 +2864,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks, NULL);
+	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);

 	/*
 	 * Truncating a relation drops all its buffers from the buffer cache
@@ -3068,13 +2920,6 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 			break;

 		case RELPERSISTENCE_PERMANENT:
-			if (stop_unlogged_build(InfoFromSMgrRel(reln), forknum))
-			{
-				mdclose(reln, forknum);
-				/* use isRedo == true, so that we drop it immediately */
-				mdunlink(InfoBFromSMgrRel(reln), forknum, true);
-				resume_unlogged_build(); /* doesn't actually resume build, just release lock */
-			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -3094,6 +2939,150 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 #endif
 }

+/*
+ * neon_start_unlogged_build() -- Starting build operation on a rel.
+ *
+ * Some indexes are built in two phases, by first populating the table with
+ * regular inserts, using the shared buffer cache but skipping WAL-logging,
+ * and WAL-logging the whole relation after it's done. Neon relies on the
+ * WAL to reconstruct pages, so we cannot use the page server in the
+ * first phase when the changes are not logged.
+ */
+static void
+neon_start_unlogged_build(SMgrRelation reln)
+{
+	/*
+	 * Currently, there can be only one unlogged relation build operation in
+	 * progress at a time. That's enough for the current usage.
+	 */
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+		neon_log(ERROR, "unlogged relation build is already in progress");
+	Assert(unlogged_build_rel == NULL);
+
+	ereport(SmgrTrace,
+			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
+					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgr_start_unlogged_build() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			unlogged_build_rel = reln;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	if (smgrnblocks(reln, MAIN_FORKNUM) != 0)
+		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
+
+	unlogged_build_rel = reln;
+	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
+
+	/* Make the relation look like it's unlogged */
+	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
+
+	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
+	 * FIXME: should we pass isRedo true to create the tablespace dir if it
+	 * doesn't exist? Is it needed?
+	 */
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
+}
+
+/*
+ * neon_finish_unlogged_build_phase_1()
+ *
+ * Call this after you have finished populating a relation in unlogged mode,
+ * before you start WAL-logging it.
+ */
+static void
+neon_finish_unlogged_build_phase_1(SMgrRelation reln)
+{
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
+					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+
+	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
+		return;
+
+	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
+	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+}
+
+/*
+ * neon_end_unlogged_build() -- Finish an unlogged rel build.
+ *
+ * Call this after you have finished WAL-logging a relation that was
+ * first populated without WAL-logging.
+ *
+ * This removes the local copy of the rel, since it's now been fully
+ * WAL-logged and is present in the page server.
+ */
+static void
+neon_end_unlogged_build(SMgrRelation reln)
+{
+	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
+
+	Assert(unlogged_build_rel == reln);
+
+	ereport(SmgrTrace,
+			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
+					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
+
+	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
+	{
+		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
+		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
+
+		/* Make the relation look permanent again */
+		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
+
+		/* Remove local copy */
+		rinfob = InfoBFromSMgrRel(reln);
+		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			neon_log(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
+				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
+				 forknum);
+
+			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+			mdclose(reln, forknum);
+			/* use isRedo == true, so that we drop it immediately */
+			mdunlink(rinfob, forknum, true);
+		}
+	}
+
+	unlogged_build_rel = NULL;
+	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+}
+
 #define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)

 static int
@@ -3123,12 +3112,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 		request_lsn = UINT64_MAX;

 	/*
-	 * GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
+	 * GetRedoStartLsn() returns LSN of the basebackup. We know that the SLRU
 	 * segment has not changed since the basebackup, because in order to
 	 * modify it, we would have had to download it already. And once
 	 * downloaded, we never evict SLRU segments from local disk.
 	 */
-	not_modified_since = GetRedoStartLsn();
+	not_modified_since = nm_adjust_lsn(GetRedoStartLsn());

 	SlruKind kind;

@@ -3187,6 +3176,40 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	return n_blocks;
 }

+static void
+AtEOXact_neon(XactEvent event, void *arg)
+{
+	switch (event)
+	{
+		case XACT_EVENT_ABORT:
+		case XACT_EVENT_PARALLEL_ABORT:
+
+			/*
+			 * Forget about any build we might have had in progress. The local
+			 * file will be unlinked by smgrDoPendingDeletes()
+			 */
+			unlogged_build_rel = NULL;
+			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+			break;
+
+		case XACT_EVENT_COMMIT:
+		case XACT_EVENT_PARALLEL_COMMIT:
+		case XACT_EVENT_PREPARE:
+		case XACT_EVENT_PRE_COMMIT:
+		case XACT_EVENT_PARALLEL_PRE_COMMIT:
+		case XACT_EVENT_PRE_PREPARE:
+			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
+			{
+				unlogged_build_rel = NULL;
+				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 (errmsg(NEON_TAG "unlogged index build was not properly finished"))));
+			}
+			break;
+	}
+}
+
 static const struct f_smgr neon_smgr =
 {
 	.smgr_init = neon_init,
@@ -3208,6 +3231,10 @@ static const struct f_smgr neon_smgr =
 	.smgr_truncate = neon_truncate,
 	.smgr_immedsync = neon_immedsync,

+	.smgr_start_unlogged_build = neon_start_unlogged_build,
+	.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
+	.smgr_end_unlogged_build = neon_end_unlogged_build,
+
 	.smgr_read_slru_segment = neon_read_slru_segment,
 };

@@ -3225,6 +3252,8 @@ smgr_neon(BackendId backend, NRelFileInfo rinfo)
 void
 smgr_init_neon(void)
 {
+	RegisterXactCallback(AtEOXact_neon, NULL);
+
 	smgr_init_standard();
 	neon_init();
 }
@@ -3275,7 +3304,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,

 		relsize = Max(nbresponse->n_blocks, blkno + 1);

-		set_cached_relsize(rinfo, forknum, relsize, NULL);
+		set_cached_relsize(rinfo, forknum, relsize);
 		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);

 		neon_log(SmgrTrace, "Set length to %d", relsize);
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -39,8 +39,7 @@ typedef struct
 typedef struct
 {
 	RelTag		tag;
-	BlockNumber size : 31;
-	BlockNumber unlogged : 1;
+	BlockNumber size;
 	dlist_node	lru_node;		/* LRU list node */
 } RelSizeEntry;

@@ -118,12 +117,9 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 			*size = entry->size;
 			relsize_ctl->hits += 1;
 			found = true;
-			if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
-			{
-				/* Move entry to the LRU list tail */
-				dlist_delete(&entry->lru_node);
-				dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
-			}
+			/* Move entry to the LRU list tail */
+			dlist_delete(&entry->lru_node);
+			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
 		}
 		else
 		{
@@ -134,15 +130,9 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 	return found;
 }

-/*
- * Cache relation size.
- * Returns true if it happens during unlogged build.
- * In this case lock is not released.
- */
-bool
-set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size, BlockNumber* old_size)
+void
+set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
-	bool unlogged = false;
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -158,69 +148,34 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber new_size,
 		 */
 		while ((entry = hash_search(relsize_hash, &tag, HASH_ENTER_NULL, &found)) == NULL)
 		{
-			if (dlist_is_empty(&relsize_ctl->lru))
-			{
-				elog(FATAL, "No more free relsize cache entries");
-			}
-			else
-			{
-				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				Assert(relsize_ctl->size > 0);
-				relsize_ctl->size -= 1;
-			}
+			RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+			hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+			Assert(relsize_ctl->size > 0);
+			relsize_ctl->size -= 1;
 		}
-		if (old_size)
-		{
-			*old_size = found ? entry->size : 0;
-		}
-		entry->size = new_size;
+		entry->size = size;
 		if (!found)
 		{
-			entry->unlogged = false;
-			if (relsize_ctl->size+1 == relsize_hash_size)
+			if (++relsize_ctl->size == relsize_hash_size)
 			{
 				/*
 				 * Remove least recently used elment from the hash.
 				 * Hash size after is becomes `relsize_hash_size-1`.
 				 * But it is not considered to be a problem, because size of this hash is expecrted large enough and +-1 doesn't matter.
 				 */
-				if (dlist_is_empty(&relsize_ctl->lru))
-				{
-					elog(FATAL, "No more free relsize cache entries");
-				}
-				else
-				{
-					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				}
+				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				relsize_ctl->size -= 1;
 			}
-			else
-			{
-				relsize_ctl->size += 1;
-			}
-		}
-		else if (entry->unlogged) /* entries of relation involved in unlogged build are pinned */
-		{
-			dlist_delete(&entry->lru_node);
-		}
-
-		if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
-		{
-			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
 		}
 		else
 		{
-			Assert(old_size);
-			unlogged = true;
+			dlist_delete(&entry->lru_node);
 		}
+		dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
 		relsize_ctl->writes += 1;
-		if (!unlogged)
-		{
-			LWLockRelease(relsize_lock);
-		}
+		LWLockRelease(relsize_lock);
 	}
-	return unlogged;
 }

 void
@@ -236,42 +191,23 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 		tag.forknum = forknum;
 		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
 		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
-		if (!found) {
-			entry->unlogged = false;
+		if (!found || entry->size < size)
 			entry->size = size;
-
-			if (relsize_ctl->size+1 == relsize_hash_size)
+		if (!found)
+		{
+			if (++relsize_ctl->size == relsize_hash_size)
 			{
-				if (dlist_is_empty(&relsize_ctl->lru))
-				{
-					elog(FATAL, "No more free relsize cache entries");
-				}
-				else
-				{
-					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				}
-			}
-			else
-			{
-				relsize_ctl->size += 1;
+				RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
+				hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
+				relsize_ctl->size -= 1;
 			}
 		}
 		else
 		{
-			if (entry->size < size)
-				entry->size = size;
-
-			if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
-			{
-				dlist_delete(&entry->lru_node);
-			}
+			dlist_delete(&entry->lru_node);
 		}
 		relsize_ctl->writes += 1;
-		if (!entry->unlogged) /* entries of relation involved in unlogged build are pinned */
-		{
-			dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
-		}
+		dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
 		LWLockRelease(relsize_lock);
 	}
 }
@@ -289,238 +225,13 @@ forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 		entry = hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
 		if (entry)
 		{
-			if (!entry->unlogged)
-			{
-				/* Entried of relations involved in unlogged build are pinned */
-				dlist_delete(&entry->lru_node);
-			}
+			dlist_delete(&entry->lru_node);
 			relsize_ctl->size -= 1;
 		}
 		LWLockRelease(relsize_lock);
 	}
 }

-/*
- * This function starts unlogged build if it was not yet started.
- * The criteria for starting iunlogged build is writing page without normal LSN.
- * It can happen in any backend when page is evicted from shared buffers.
- * Or can not happen at all if index fits in shared buffers.
- *
- * If this function really starts unlogged build, then it returns true, remove entry from LRU list
- * protecting it from eviction until the end of unlogged build.
- * Also it keeps lock on relsize hash. This lock should be later released using resume_unlogged_build().
- * It allows caller to perform some actions
- * in critical section, for example right now it create relation on the disk using mdcreate
- */
-bool
-start_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize)
-{
-	bool start = false;
-	if (relsize_hash_size > 0)
-	{
-		RelTag		tag;
-		RelSizeEntry *entry;
-		bool		found;
-
-		tag.rinfo = rinfo;
-		tag.forknum = forknum;
-		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
-		entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
-		if (!found) {
-			*relsize = 0;
-			entry->size = blocknum + 1;
-			start = true;
-
-			if (relsize_ctl->size+1 == relsize_hash_size)
-			{
-				if (dlist_is_empty(&relsize_ctl->lru))
-				{
-					elog(FATAL, "No more free relsize cache entries");
-				}
-				else
-				{
-					RelSizeEntry *victim = dlist_container(RelSizeEntry, lru_node, dlist_pop_head_node(&relsize_ctl->lru));
-					hash_search(relsize_hash, &victim->tag, HASH_REMOVE, NULL);
-				}
-			}
-			else
-			{
-				relsize_ctl->size += 1;
-			}
-		}
-		else
-		{
-			start = !entry->unlogged;
-
-			*relsize = entry->size;
-			if (entry->size <= blocknum)
-			{
-				entry->size = blocknum + 1;
-			}
-
-			if (start)
-			{
-				/* relation involved in unlogged build are pinned until the end of the build */
-				dlist_delete(&entry->lru_node);
-			}
-		}
-		entry->unlogged = true;
-		relsize_ctl->writes += 1;
-
-		/*
-		 * We are not putting entry in LRU least to prevent it fro eviction until the end of unlogged build
-		 */
-
-		if (start)
-			elog(LOG, "Start unlogged build for %u/%u/%u.%u",
-				 RelFileInfoFmt(rinfo), forknum);
-	}
-	return start;
-}
-
-/*
- * Check if unlogged build is in progress.
- * If so, true is returned and lock on relsize cache is hold.
- * It should be later released by calling resume_unlogged_build().
- * It allows to read page from local file without risk that it is removed by stop_unlogged_build by some other backend.
- */
-bool
-is_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber* relsize)
-{
-	bool		unlogged = false;
-
-	if (relsize_hash_size > 0)
-	{
-		RelTag		tag;
-		RelSizeEntry *entry;
-
-		tag.rinfo = rinfo;
-		tag.forknum = forknum;
-		LWLockAcquire(relsize_lock, LW_SHARED);
-		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
-		if (entry != NULL)
-		{
-			unlogged = entry->unlogged;
-			*relsize = entry->size;
-			relsize_ctl->hits += 1;
-		}
-		else
-		{
-			relsize_ctl->misses += 1;
-		}
-		if (!unlogged)
-			LWLockRelease(relsize_lock);
-	}
-	return unlogged;
-}
-
-/*
- * Check if relation is extended during unlogged build.
- * This function object lock on relsize cache which
- * should be later released by calling resume_unlogged_build().
- * It allows to atomically extend local file.
- */
-bool
-is_unlogged_build_extend(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, BlockNumber* relsize)
-{
-	bool		unlogged = false;
-
-	if (relsize_hash_size > 0)
-	{
-		RelTag		tag;
-		RelSizeEntry *entry;
-
-		tag.rinfo = rinfo;
-		tag.forknum = forknum;
-
-		LWLockAcquire(relsize_lock, LW_SHARED);
-		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
-		if (entry != NULL)
-		{
-			if (entry->size <= blocknum)
-			{
-				/* Very rare case: it can happen only if relation is thrown away from relcache before unlogged build is detected */
-				/* Repeat search under exclusive lock */
-				LWLockRelease(relsize_lock);
-				LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
-				entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
-				if (entry == NULL)
-				{
-					relsize_ctl->misses += 1;
-					LWLockRelease(relsize_lock);
-					return false;
-				}
-			}
-			unlogged = entry->unlogged;
-			*relsize = entry->size;
-			if (entry->size <= blocknum)
-			{
-				entry->size = blocknum + 1;
-			}
-			relsize_ctl->hits += 1;
-		}
-		else
-		{
-			relsize_ctl->misses += 1;
-		}
-	}
-	return unlogged;
-}
-
-/*
- * Check if unlogged build is in progress and if so, clear the flag and return entry to LRU list.
- * If it was unlogged build, true is returned and lock on relsize cache is hold.
- * It should be later released by calling resume_unlogged_build().
- * It allows to atomically unlink local file.
- */
-bool
-stop_unlogged_build(NRelFileInfo rinfo, ForkNumber forknum)
-{
-	bool		unlogged = false;
-
-	if (relsize_hash_size > 0)
-	{
-		RelTag		tag;
-		RelSizeEntry *entry;
-
-		tag.rinfo = rinfo;
-		tag.forknum = forknum;
-		LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
-		entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
-		if (entry != NULL)
-		{
-			unlogged = entry->unlogged;
-			entry->unlogged = false;
-			relsize_ctl->hits += 1;
-			if (unlogged)
-			{
-				elog(LOG, "Stop unlogged build for %u/%u/%u.%u",
-					 RelFileInfoFmt(rinfo), forknum);
-				/* Return entry to the LRU list */
-				dlist_push_tail(&relsize_ctl->lru, &entry->lru_node);
-			}
-		}
-		else
-		{
-			relsize_ctl->misses += 1;
-		}
-		if (!unlogged)
-			LWLockRelease(relsize_lock);
-	}
-	return unlogged;
-}
-
-/*
- * Release lock obtained by start_unlogged_build or is_unlogged-build functions
- */
-void
-resume_unlogged_build(void)
-{
-	if (relsize_hash_size > 0)
-		LWLockRelease(relsize_lock);
-}
-
-
 void
 relsize_hash_init(void)
 {
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -100,17 +100,12 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
 static void WalSndLoop(WalProposer *wp);
 static void XLogBroadcastWalProposer(WalProposer *wp);

-static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
-static void XLogWalPropClose(XLogRecPtr recptr);
-
 static void add_nwr_event_set(Safekeeper *sk, uint32 events);
 static void update_nwr_event_set(Safekeeper *sk, uint32 events);
 static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);

 static void CheckGracefulShutdown(WalProposer *wp);

-static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
-
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -1236,8 +1231,6 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
 static void
 WalSndLoop(WalProposer *wp)
 {
-	XLogRecPtr	flushPtr;
-
 	/* Clear any already-pending wakeups */
 	ResetLatch(MyLatch);

@@ -1333,8 +1326,9 @@ XLogBroadcastWalProposer(WalProposer *wp)
 }

 /*
-  Used to download WAL before basebackup for logical walsenders from sk, no longer
-  needed because walsender always uses neon_walreader.
+  Used to download WAL before basebackup for walproposer/logical walsenders. No
+  longer used, replaced by neon_walreader; but callback still exists because
+  simulation tests use it.
 */
 static bool
 WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
@@ -1342,136 +1336,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 	return true;
 }

-/*
- * These variables are used similarly to openLogFile/SegNo,
- * but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
- * corresponding the filename of walpropFile.
- */
-static int	walpropFile = -1;
-static TimeLineID walpropFileTLI = 0;
-static XLogSegNo walpropSegNo = 0;
-
-/*
- * Write XLOG data to disk.
- */
-static void
-XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
-{
-	int			startoff;
-	int			byteswritten;
-
-	/*
-	 * Apart from walproposer, basebackup LSN page is also written out by
-	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL
-	 * buffers here to avoid dummy page overwriting correct one we download
-	 * here. Ugly, but alternatives are about the same ugly. We won't need
-	 * that if we switch to on-demand WAL download from safekeepers, without
-	 * writing to disk.
-	 *
-	 * https://github.com/neondatabase/neon/issues/5749
-	 */
-	if (!wp->config->syncSafekeepers)
-		XLogUpdateWalBuffers(buf, recptr, nbytes);
-
-	while (nbytes > 0)
-	{
-		int			segbytes;
-
-		/* Close the current segment if it's completed */
-		if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-			XLogWalPropClose(recptr);
-
-		if (walpropFile < 0)
-		{
-#if PG_VERSION_NUM >= 150000
-			/* FIXME Is it ok to use hardcoded value here? */
-			TimeLineID	tli = 1;
-#else
-			bool		use_existent = true;
-#endif
-			/* Create/use new log file */
-			XLByteToSeg(recptr, walpropSegNo, wal_segment_size);
-#if PG_VERSION_NUM >= 150000
-			walpropFile = XLogFileInit(walpropSegNo, tli);
-			walpropFileTLI = tli;
-#else
-			walpropFile = XLogFileInit(walpropSegNo, &use_existent, false);
-			walpropFileTLI = ThisTimeLineID;
-#endif
-		}
-
-		/* Calculate the start offset of the received logs */
-		startoff = XLogSegmentOffset(recptr, wal_segment_size);
-
-		if (startoff + nbytes > wal_segment_size)
-			segbytes = wal_segment_size - startoff;
-		else
-			segbytes = nbytes;
-
-		/* OK to write the logs */
-		errno = 0;
-
-		byteswritten = pg_pwrite(walpropFile, buf, segbytes, (off_t) startoff);
-		if (byteswritten <= 0)
-		{
-			char		xlogfname[MAXFNAMELEN];
-			int			save_errno;
-
-			/* if write didn't set errno, assume no disk space */
-			if (errno == 0)
-				errno = ENOSPC;
-
-			save_errno = errno;
-			XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-			errno = save_errno;
-			ereport(PANIC,
-					(errcode_for_file_access(),
-					 errmsg("could not write to log segment %s "
-							"at offset %u, length %lu: %m",
-							xlogfname, startoff, (unsigned long) segbytes)));
-		}
-
-		/* Update state for write */
-		recptr += byteswritten;
-
-		nbytes -= byteswritten;
-		buf += byteswritten;
-	}
-
-	/*
-	 * Close the current segment if it's fully written up in the last cycle of
-	 * the loop.
-	 */
-	if (walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size))
-	{
-		XLogWalPropClose(recptr);
-	}
-}
-
-/*
- * Close the current segment.
- */
-static void
-XLogWalPropClose(XLogRecPtr recptr)
-{
-	Assert(walpropFile >= 0 && !XLByteInSeg(recptr, walpropSegNo, wal_segment_size));
-
-	if (close(walpropFile) != 0)
-	{
-		char		xlogfname[MAXFNAMELEN];
-
-		XLogFileName(xlogfname, walpropFileTLI, walpropSegNo, wal_segment_size);
-
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not close log segment %s: %m",
-						xlogfname)));
-	}
-
-	walpropFile = -1;
-}
-
 static void
 walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
@@ -1987,58 +1851,6 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
 	elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
 }

-static XLogRecPtr
-GetLogRepRestartLSN(WalProposer *wp)
-{
-	FILE	   *f;
-	XLogRecPtr	lrRestartLsn = InvalidXLogRecPtr;
-
-	/* We don't need to do anything in syncSafekeepers mode. */
-	if (wp->config->syncSafekeepers)
-		return InvalidXLogRecPtr;
-
-	/*
-	 * If there are active logical replication subscription we need to provide
-	 * enough WAL for their WAL senders based on th position of their
-	 * replication slots.
-	 */
-	f = fopen("restart.lsn", "rb");
-	if (f != NULL)
-	{
-		size_t		rc = fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
-
-		fclose(f);
-		if (rc == 1 && lrRestartLsn != InvalidXLogRecPtr)
-		{
-			uint64		download_range_mb;
-
-			wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
-
-			/*
-			 * If we need to download more than a max_slot_wal_keep_size,
-			 * don't do it to avoid risk of exploding pg_wal. Logical
-			 * replication won't work until recreated, but at least compute
-			 * would start; this also follows max_slot_wal_keep_size
-			 * semantics.
-			 */
-			download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
-			if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
-			{
-				wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
-						LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
-				return InvalidXLogRecPtr;
-			}
-
-			/*
-			 * start from the beginning of the segment to fetch page headers
-			 * verifed by XLogReader
-			 */
-			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
-		}
-	}
-	return lrRestartLsn;
-}
-
 void
 SetNeonCurrentClusterSize(uint64 size)
 {
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -24,8 +24,12 @@
 #include "walproposer.h"

 static NeonWALReader *wal_reader = NULL;
+
+struct WalSnd;
+extern struct WalSnd *MyWalSnd;
 extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
 extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
+extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);

 static XLogRecPtr
 NeonWALReadWaitForWAL(XLogRecPtr loc)
@@ -36,7 +40,28 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
 		CHECK_FOR_INTERRUPTS();
 	}

-	return WalSndWaitForWal(loc);
+	// Walsender sends keepalives and stuff, so better use its normal wait
+	if (MyWalSnd != NULL)
+		return WalSndWaitForWal(loc);
+
+	for (;;)
+	{
+		XLogRecPtr flush_ptr;
+		if (!RecoveryInProgress())
+#if PG_VERSION_NUM >= 150000
+			flush_ptr = GetFlushRecPtr(NULL);
+#else
+			flush_ptr = GetFlushRecPtr();
+#endif
+		else
+			flush_ptr = GetXLogReplayRecPtr(NULL);
+
+		if (loc <= flush_ptr)
+			return flush_ptr;
+
+		CHECK_FOR_INTERRUPTS();
+		pg_usleep(1000);
+	}
 }

 static int
--- a/poetry.lock
+++ b/poetry.lock
@@ -2806,13 +2806,13 @@ files = [

 [[package]]
 name = "urllib3"
-version = "1.26.18"
+version = "1.26.19"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
 files = [
-    {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"},
-    {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"},
+    {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"},
+    {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"},
 ]

 [package.extras]
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -356,7 +356,7 @@ async fn main() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_publisher = match &regional_redis_client {
+    let redis_publisher = match &redis_notifications_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,16 +1,183 @@
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::fmt::{self, Display};

 use crate::auth::IpPattern;

 use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
+use crate::proxy::retry::ShouldRetry;

 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
 pub struct ConsoleError {
    pub error: Box<str>,
+    #[serde(skip)]
+    pub http_status_code: http::StatusCode,
+    pub status: Option<Status>,
+}
+
+impl ConsoleError {
+    pub fn get_reason(&self) -> Reason {
+        self.status
+            .as_ref()
+            .and_then(|s| s.details.error_info.as_ref())
+            .map(|e| e.reason)
+            .unwrap_or(Reason::Unknown)
+    }
+    pub fn get_user_facing_message(&self) -> String {
+        use super::provider::errors::REQUEST_FAILED;
+        self.status
+            .as_ref()
+            .and_then(|s| s.details.user_facing_message.as_ref())
+            .map(|m| m.message.clone().into())
+            .unwrap_or_else(|| {
+                // Ask @neondatabase/control-plane for review before adding more.
+                match self.http_status_code {
+                    http::StatusCode::NOT_FOUND => {
+                        // Status 404: failed to get a project-related resource.
+                        format!("{REQUEST_FAILED}: endpoint cannot be found")
+                    }
+                    http::StatusCode::NOT_ACCEPTABLE => {
+                        // Status 406: endpoint is disabled (we don't allow connections).
+                        format!("{REQUEST_FAILED}: endpoint is disabled")
+                    }
+                    http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
+                        // Status 423 or 422: project might be in maintenance mode (or bad state), or quotas exceeded.
+                        format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
+                    }
+                    _ => REQUEST_FAILED.to_owned(),
+                }
+            })
+    }
+}
+
+impl Display for ConsoleError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let msg = self
+            .status
+            .as_ref()
+            .and_then(|s| s.details.user_facing_message.as_ref())
+            .map(|m| m.message.as_ref())
+            .unwrap_or_else(|| &self.error);
+        write!(f, "{}", msg)
+    }
+}
+
+impl ShouldRetry for ConsoleError {
+    fn could_retry(&self) -> bool {
+        if self.status.is_none() || self.status.as_ref().unwrap().details.retry_info.is_none() {
+            // retry some temporary failures because the compute was in a bad state
+            // (bad request can be returned when the endpoint was in transition)
+            return match &self {
+                ConsoleError {
+                    http_status_code: http::StatusCode::BAD_REQUEST,
+                    ..
+                } => true,
+                // don't retry when quotas are exceeded
+                ConsoleError {
+                    http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
+                    ref error,
+                    ..
+                } => !error.contains("compute time quota of non-primary branches is exceeded"),
+                // locked can be returned when the endpoint was in transition
+                // or when quotas are exceeded. don't retry when quotas are exceeded
+                ConsoleError {
+                    http_status_code: http::StatusCode::LOCKED,
+                    ref error,
+                    ..
+                } => {
+                    !error.contains("quota exceeded")
+                        && !error.contains("the limit for current plan reached")
+                }
+                _ => false,
+            };
+        }
+
+        // retry if the response has a retry delay
+        if let Some(retry_info) = self
+            .status
+            .as_ref()
+            .and_then(|s| s.details.retry_info.as_ref())
+        {
+            retry_info.retry_delay_ms > 0
+        } else {
+            false
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Status {
+    pub code: Box<str>,
+    pub message: Box<str>,
+    pub details: Details,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Details {
+    pub error_info: Option<ErrorInfo>,
+    pub retry_info: Option<RetryInfo>,
+    pub user_facing_message: Option<UserFacingMessage>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ErrorInfo {
+    pub reason: Reason,
+    // Schema could also have `metadata` field, but it's not structured. Skip it for now.
+}
+
+#[derive(Clone, Copy, Debug, Deserialize, Default)]
+pub enum Reason {
+    #[serde(rename = "ROLE_PROTECTED")]
+    RoleProtected,
+    #[serde(rename = "RESOURCE_NOT_FOUND")]
+    ResourceNotFound,
+    #[serde(rename = "PROJECT_NOT_FOUND")]
+    ProjectNotFound,
+    #[serde(rename = "ENDPOINT_NOT_FOUND")]
+    EndpointNotFound,
+    #[serde(rename = "BRANCH_NOT_FOUND")]
+    BranchNotFound,
+    #[serde(rename = "RATE_LIMIT_EXCEEDED")]
+    RateLimitExceeded,
+    #[serde(rename = "NON_PRIMARY_BRANCH_COMPUTE_TIME_EXCEEDED")]
+    NonPrimaryBranchComputeTimeExceeded,
+    #[serde(rename = "ACTIVE_TIME_QUOTA_EXCEEDED")]
+    ActiveTimeQuotaExceeded,
+    #[serde(rename = "COMPUTE_TIME_QUOTA_EXCEEDED")]
+    ComputeTimeQuotaExceeded,
+    #[serde(rename = "WRITTEN_DATA_QUOTA_EXCEEDED")]
+    WrittenDataQuotaExceeded,
+    #[serde(rename = "DATA_TRANSFER_QUOTA_EXCEEDED")]
+    DataTransferQuotaExceeded,
+    #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
+    LogicalSizeQuotaExceeded,
+    #[default]
+    #[serde(other)]
+    Unknown,
+}
+
+impl Reason {
+    pub fn is_not_found(&self) -> bool {
+        matches!(
+            self,
+            Reason::ResourceNotFound
+                | Reason::ProjectNotFound
+                | Reason::EndpointNotFound
+                | Reason::BranchNotFound
+        )
+    }
+}
+
+#[derive(Debug, Deserialize)]
+pub struct RetryInfo {
+    pub retry_delay_ms: u64,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct UserFacingMessage {
+    pub message: Box<str>,
 }

 /// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -25,8 +25,8 @@ use tracing::info;

 pub mod errors {
    use crate::{
+        console::messages::{self, ConsoleError},
        error::{io_error, ReportableError, UserFacingError},
-        http,
        proxy::retry::ShouldRetry,
    };
    use thiserror::Error;
@@ -34,17 +34,14 @@ pub mod errors {
    use super::ApiLockError;

    /// A go-to error message which doesn't leak any detail.
-    const REQUEST_FAILED: &str = "Console request failed";
+    pub const REQUEST_FAILED: &str = "Console request failed";

    /// Common console API error.
    #[derive(Debug, Error)]
    pub enum ApiError {
        /// Error returned by the console itself.
-        #[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
-        Console {
-            status: http::StatusCode,
-            text: Box<str>,
-        },
+        #[error("{REQUEST_FAILED} with {0}")]
+        Console(ConsoleError),

        /// Various IO errors like broken pipe or malformed payload.
        #[error("{REQUEST_FAILED}: {0}")]
@@ -53,11 +50,11 @@ pub mod errors {

    impl ApiError {
        /// Returns the failure reason reported by the console, or `Reason::Unknown` for any other error.
-        pub fn http_status_code(&self) -> Option<http::StatusCode> {
+        pub fn get_reason(&self) -> messages::Reason {
            use ApiError::*;
            match self {
-                Console { status, .. } => Some(*status),
-                _ => None,
+                Console(e) => e.get_reason(),
+                _ => messages::Reason::Unknown,
            }
        }
    }
@@ -67,22 +64,7 @@ pub mod errors {
            use ApiError::*;
            match self {
                // To minimize risks, only select errors are forwarded to users.
-                // Ask @neondatabase/control-plane for review before adding more.
-                Console { status, .. } => match *status {
-                    http::StatusCode::NOT_FOUND => {
-                        // Status 404: failed to get a project-related resource.
-                        format!("{REQUEST_FAILED}: endpoint cannot be found")
-                    }
-                    http::StatusCode::NOT_ACCEPTABLE => {
-                        // Status 406: endpoint is disabled (we don't allow connections).
-                        format!("{REQUEST_FAILED}: endpoint is disabled")
-                    }
-                    http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
-                        // Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
-                        format!("{REQUEST_FAILED}: endpoint is temporarily unavailable. Check your quotas and/or contact our support.")
-                    }
-                    _ => REQUEST_FAILED.to_owned(),
-                },
+                Console(c) => c.get_user_facing_message(),
                _ => REQUEST_FAILED.to_owned(),
            }
        }
@@ -91,29 +73,56 @@ pub mod errors {
    impl ReportableError for ApiError {
        fn get_error_kind(&self) -> crate::error::ErrorKind {
            match self {
-                ApiError::Console {
-                    status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
-                    ..
-                } => crate::error::ErrorKind::User,
-                ApiError::Console {
-                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
-                    text,
-                } if text.contains("compute time quota of non-primary branches is exceeded") => {
-                    crate::error::ErrorKind::User
+                ApiError::Console(e) => {
+                    use crate::error::ErrorKind::*;
+                    match e.get_reason() {
+                        crate::console::messages::Reason::RoleProtected => User,
+                        crate::console::messages::Reason::ResourceNotFound => User,
+                        crate::console::messages::Reason::ProjectNotFound => User,
+                        crate::console::messages::Reason::EndpointNotFound => User,
+                        crate::console::messages::Reason::BranchNotFound => User,
+                        crate::console::messages::Reason::RateLimitExceeded => ServiceRateLimit,
+                        crate::console::messages::Reason::NonPrimaryBranchComputeTimeExceeded => {
+                            User
+                        }
+                        crate::console::messages::Reason::ActiveTimeQuotaExceeded => User,
+                        crate::console::messages::Reason::ComputeTimeQuotaExceeded => User,
+                        crate::console::messages::Reason::WrittenDataQuotaExceeded => User,
+                        crate::console::messages::Reason::DataTransferQuotaExceeded => User,
+                        crate::console::messages::Reason::LogicalSizeQuotaExceeded => User,
+                        crate::console::messages::Reason::Unknown => match &e {
+                            ConsoleError {
+                                http_status_code:
+                                    http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
+                                ..
+                            } => crate::error::ErrorKind::User,
+                            ConsoleError {
+                                http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY,
+                                error,
+                                ..
+                            } if error.contains(
+                                "compute time quota of non-primary branches is exceeded",
+                            ) =>
+                            {
+                                crate::error::ErrorKind::User
+                            }
+                            ConsoleError {
+                                http_status_code: http::StatusCode::LOCKED,
+                                error,
+                                ..
+                            } if error.contains("quota exceeded")
+                                || error.contains("the limit for current plan reached") =>
+                            {
+                                crate::error::ErrorKind::User
+                            }
+                            ConsoleError {
+                                http_status_code: http::StatusCode::TOO_MANY_REQUESTS,
+                                ..
+                            } => crate::error::ErrorKind::ServiceRateLimit,
+                            ConsoleError { .. } => crate::error::ErrorKind::ControlPlane,
+                        },
+                    }
                }
-                ApiError::Console {
-                    status: http::StatusCode::LOCKED,
-                    text,
-                } if text.contains("quota exceeded")
-                    || text.contains("the limit for current plan reached") =>
-                {
-                    crate::error::ErrorKind::User
-                }
-                ApiError::Console {
-                    status: http::StatusCode::TOO_MANY_REQUESTS,
-                    ..
-                } => crate::error::ErrorKind::ServiceRateLimit,
-                ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
                ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
            }
        }
@@ -124,31 +133,7 @@ pub mod errors {
            match self {
                // retry some transport errors
                Self::Transport(io) => io.could_retry(),
-                // retry some temporary failures because the compute was in a bad state
-                // (bad request can be returned when the endpoint was in transition)
-                Self::Console {
-                    status: http::StatusCode::BAD_REQUEST,
-                    ..
-                } => true,
-                // don't retry when quotas are exceeded
-                Self::Console {
-                    status: http::StatusCode::UNPROCESSABLE_ENTITY,
-                    ref text,
-                } => !text.contains("compute time quota of non-primary branches is exceeded"),
-                // locked can be returned when the endpoint was in transition
-                // or when quotas are exceeded. don't retry when quotas are exceeded
-                Self::Console {
-                    status: http::StatusCode::LOCKED,
-                    ref text,
-                } => {
-                    // written data quota exceeded
-                    // data transfer quota exceeded
-                    // compute time quota exceeded
-                    // logical size quota exceeded
-                    !text.contains("quota exceeded")
-                        && !text.contains("the limit for current plan reached")
-                }
-                _ => false,
+                Self::Console(e) => e.could_retry(),
            }
        }
    }
@@ -509,7 +494,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
        self.metrics
            .semaphore_acquire_seconds
            .observe(now.elapsed().as_secs_f64());
-
+        info!("acquired permit {:?}", now.elapsed().as_secs_f64());
        Ok(WakeComputePermit { permit: permit? })
    }

--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -94,12 +94,14 @@ impl Api {
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
                // A "not found" reason is special: it's ok not to have a secret.
-                Err(e) => match e.http_status_code() {
-                    Some(http::StatusCode::NOT_FOUND) => {
+                // TODO(anna): retry
+                Err(e) => {
+                    if e.get_reason().is_not_found() {
                        return Ok(AuthInfo::default());
+                    } else {
+                        return Err(e.into());
                    }
-                    _otherwise => return Err(e.into()),
-                },
+                }
            };

            let secret = if body.role_secret.is_empty() {
@@ -328,19 +330,24 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
        info!("request succeeded, processing the body");
        return Ok(response.json().await?);
    }
+    let s = response.bytes().await?;
+    // Log the raw response body so we can detect error cases not covered by the error struct.
+    info!("response_error plaintext: {:?}", s);

    // Don't throw an error here because it's not as important
    // as the fact that the request itself has failed.
-    let body = response.json().await.unwrap_or_else(|e| {
+    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
        warn!("failed to parse error body: {e}");
        ConsoleError {
            error: "reason unclear (malformed error message)".into(),
+            http_status_code: status,
+            status: None,
        }
    });
+    body.http_status_code = status;

-    let text = body.error;
-    error!("console responded with an error ({status}): {text}");
-    Err(ApiError::Console { status, text })
+    error!("console responded with an error ({status}): {body:?}");
+    Err(ApiError::Console(body))
 }

 fn parse_host_port(input: &str) -> Option<(&str, u16)> {
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -91,7 +91,7 @@ pub async fn task_main(
        let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();

        connections.spawn(async move {
-            let (socket, peer_addr) = match read_proxy_protocol(socket).await{
+            let (socket, peer_addr) = match read_proxy_protocol(socket).await {
                Ok((socket, Some(addr))) => (socket, addr.ip()),
                Err(e) => {
                    error!("per-client task finished with an error: {e:#}");
@@ -101,36 +101,38 @@ pub async fn task_main(
                    error!("missing required client IP");
                    return;
                }
-                Ok((socket, None)) => (socket, peer_addr.ip())
+                Ok((socket, None)) => (socket, peer_addr.ip()),
            };

            match socket.inner.set_nodelay(true) {
-                Ok(()) => {},
+                Ok(()) => {}
                Err(e) => {
                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
                    return;
-                },
+                }
            };

            let mut ctx = RequestMonitoring::new(
-                    session_id,
-                    peer_addr,
-                    crate::metrics::Protocol::Tcp,
-                    &config.region,
-                );
+                session_id,
+                peer_addr,
+                crate::metrics::Protocol::Tcp,
+                &config.region,
+            );
            let span = ctx.span.clone();

-            let res = handle_client(
-                config,
-                &mut ctx,
-                cancellation_handler,
-                socket,
-                ClientMode::Tcp,
-                endpoint_rate_limiter2,
-                conn_gauge,
-            )
-            .instrument(span.clone())
-            .await;
+            let startup = Box::pin(
+                handle_client(
+                    config,
+                    &mut ctx,
+                    cancellation_handler,
+                    socket,
+                    ClientMode::Tcp,
+                    endpoint_rate_limiter2,
+                    conn_gauge,
+                )
+                .instrument(span.clone()),
+            );
+            let res = startup.await;

            match res {
                Err(e) => {
--- a/proxy/src/proxy/copy_bidirectional.rs
+++ b/proxy/src/proxy/copy_bidirectional.rs
@@ -98,7 +98,7 @@ pub(super) struct CopyBuffer {
    amt: u64,
    buf: Box<[u8]>,
 }
-const DEFAULT_BUF_SIZE: usize = 8 * 1024;
+const DEFAULT_BUF_SIZE: usize = 1024;

 impl CopyBuffer {
    pub(super) fn new() -> Self {
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;`