Storage release 2025-05-23 06:10 UTC

Add online_advisor extension (#11898 )
## Problem Detect problems with Postgres optimiser: lack of indexes and statistics ## Summary of changes https://github.com/knizhnik/online_advisor Add online_advisor extension to docker image --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2026-05-24 16:40:38 +00:00 · 2025-05-23 06:11:00 +00:00 · 2025-05-23 05:08:32 +00:00 · 2025-05-22 19:15:05 +00:00 · 2025-05-22 15:20:50 +00:00 · 2025-05-22 12:45:00 +00:00
160 changed files with 8066 additions and 1716 deletions
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -49,10 +49,6 @@ inputs:
    description: 'A JSON object with project settings'
    required: false
    default: '{}'
-  default_endpoint_settings:
-    description: 'A JSON object with the default endpoint settings'
-    required: false
-    default: '{}'

 outputs:
  dsn:
@@ -139,21 +135,6 @@ runs:
            -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
            -d "{\"scheduling\": \"Essential\"}"
        fi
-        # XXX
-        # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API.
-        # https://github.com/neondatabase/cloud/issues/27108
-        if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then
-          PROJECT_DATA=$(curl -X GET \
-              "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \
-              -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-              -d "{\"scheduling\": \"Essential\"}"
-          )
-          NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}")
-          curl -X POST --fail \
-                "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \
-                -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-                --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}"
-        fi
        

      env:
@@ -171,4 +152,3 @@ runs:
        PSQL: ${{ inputs.psql_path }}
        LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
        PROJECT_SETTINGS: ${{ inputs.project_settings }}
-        DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -279,18 +279,14 @@ jobs:
          # run all non-pageserver tests
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'

-          # run pageserver tests with different settings
-          for get_vectored_concurrent_io in sequential sidecar-task; do
-            for io_engine in std-fs tokio-epoll-uring ; do
-                for io_mode in buffered direct direct-rw ; do
-                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \
-                  ${cov_prefix} \
-                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-              done
-            done
-          done
+          # run pageserver tests
+          # (When developing new pageserver features gated by config fields, we commonly make the rust
+          # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME.
+          # Then run the nextest invocation below for all relevant combinations. Singling out the
+          # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
+          NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring  \
+          ${cov_prefix} \
+          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -405,8 +401,6 @@ jobs:
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

      # Temporary disable this step until we figure out why it's so flaky
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,8 +323,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
@@ -965,7 +963,7 @@ jobs:
          fi

      - name: Verify docker-compose example and test extensions
-        timeout-minutes: 20
+        timeout-minutes: 60
        env:
          TAG: >-
            ${{
--- a/.github/workflows/cloud-extensions.yml
+++ b/.github/workflows/cloud-extensions.yml
@@ -35,7 +35,7 @@ jobs:
      matrix:
        pg-version: [16, 17]

-    runs-on: [ self-hosted, small ]
+    runs-on: us-east-2
    container:
      # We use the neon-test-extensions image here as it contains the source code for the extensions.
      image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest
@@ -71,20 +71,7 @@ jobs:
          region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
          postgres_version: ${{ matrix.pg-version }}
          project_settings: ${{ steps.project-settings.outputs.settings }}
-          # We need these settings to get the expected output results.
-          # We cannot use the environment variables e.g. PGTZ due to
-          # https://github.com/neondatabase/neon/issues/1287
-          default_endpoint_settings: >
-            {
-              "pg_settings": {
-                "DateStyle": "Postgres,MDY",
-                "TimeZone": "America/Los_Angeles",
-                "compute_query_id": "off",
-                "neon.allow_unstable_extensions": "on"
-              }
-            }
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}

      - name: Run the regression tests
        run: /run-tests.sh -r /ext-src
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -63,8 +63,10 @@ jobs:

      - name: Filter out only v-string for build matrix
        id: postgres_changes
+        env:
+          CHANGES: ${{ steps.files_changed.outputs.changes }}
        run: |
-          v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
+          # CHANGES holds a JSON array; quote it so the shell neither word-splits it nor treats
+          # the surrounding [...] as a glob pattern before jq parses it.
+          v_strings_only_as_json_array=$(echo "${CHANGES}" | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
          echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"

  check-macos-build:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1112,6 +1112,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
 [[package]]
 name = "cgroups-rs"
 version = "0.3.3"
@@ -1306,7 +1312,7 @@ dependencies = [
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "notify",
 "num_cpus",
 "once_cell",
@@ -1429,7 +1435,7 @@ dependencies = [
 "humantime-serde",
 "hyper 0.14.30",
 "jsonwebtoken",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
@@ -3512,9 +3518,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.169"
+version = "0.2.172"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
+checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"

 [[package]]
 name = "libloading"
@@ -3788,6 +3794,16 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "neon-shmem"
+version = "0.1.0"
+dependencies = [
+ "nix 0.30.1",
+ "tempfile",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -3821,12 +3837,13 @@ dependencies = [

 [[package]]
 name = "nix"
-version = "0.27.1"
+version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
+checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
 "bitflags 2.8.0",
 "cfg-if",
+ "cfg_aliases",
 "libc",
 "memoffset 0.9.0",
 ]
@@ -3881,6 +3898,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -4165,6 +4192,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -4269,6 +4302,7 @@ dependencies = [
 "enumset",
 "fail",
 "futures",
+ "hashlink",
 "hex",
 "hex-literal",
 "http-utils",
@@ -4280,7 +4314,7 @@ dependencies = [
 "jsonwebtoken",
 "md5",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -4331,6 +4365,7 @@ dependencies = [
 "toml_edit",
 "tracing",
 "tracing-utils",
+ "twox-hash",
 "url",
 "utils",
 "uuid",
@@ -4355,7 +4390,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "itertools 0.10.5",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "postgres_backend",
 "postgres_ffi",
@@ -4416,6 +4451,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.3",
+ "tonic",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
@@ -4848,6 +4893,19 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "posthog_client_lite"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "sha2",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "powerfmt"
 version = "0.2.0"
@@ -5197,6 +5255,7 @@ dependencies = [
 "tracing-log",
 "tracing-opentelemetry",
 "tracing-subscriber",
+ "tracing-test",
 "tracing-utils",
 "try-lock",
 "typed-json",
@@ -7647,6 +7706,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -7660,6 +7720,27 @@ dependencies = [
 "tracing-serde",
 ]

+[[package]]
+name = "tracing-test"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
+dependencies = [
+ "tracing-core",
+ "tracing-subscriber",
+ "tracing-test-macro",
+]
+
+[[package]]
+name = "tracing-test-macro"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
+dependencies = [
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "tracing-utils"
 version = "0.1.0"
@@ -7885,7 +7966,7 @@ dependencies = [
 "humantime",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pem",
 "pin-project-lite",
@@ -8439,8 +8520,10 @@ dependencies = [
 "fail",
 "form_urlencoded",
 "futures-channel",
+ "futures-core",
 "futures-executor",
 "futures-io",
+ "futures-task",
 "futures-util",
 "generic-array",
 "getrandom 0.2.11",
@@ -8459,6 +8542,7 @@ dependencies = [
 "log",
 "memchr",
 "nix 0.26.4",
+ "nix 0.30.1",
 "nom",
 "num",
 "num-bigint",
@@ -8470,6 +8554,7 @@ dependencies = [
 "once_cell",
 "p256 0.13.2",
 "parquet",
+ "percent-encoding",
 "prettyplease",
 "proc-macro2",
 "prost 0.13.3",
@@ -8508,6 +8593,7 @@ dependencies = [
 "tracing",
 "tracing-core",
 "tracing-log",
+ "tracing-subscriber",
 "url",
 "uuid",
 "zeroize",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
+    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -23,9 +24,11 @@ members = [
    "libs/postgres_ffi",
    "libs/safekeeper_api",
    "libs/desim",
+    "libs/neon-shmem",
    "libs/utils",
    "libs/consumption_metrics",
    "libs/postgres_backend",
+    "libs/posthog_client_lite",
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
@@ -126,7 +129,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
-nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
@@ -250,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.86.0
+ENV RUSTC_VERSION=1.87.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -582,6 +582,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control

+#########################################################################################
+#
+# Layer "online_advisor-build"
+# compile online_advisor extension
+#
+#########################################################################################
+FROM build-deps AS online_advisor-src
+ARG PG_VERSION
+
+# online_advisor supports all Postgres versions starting from PG14, but prior to PG17 it has to be
+# included in shared_preload_libraries. Only v17 is downloaded below; for every other version this
+# stage exits successfully without creating online_advisor-src.
+# last release 1.0 - May 15, 2025
+WORKDIR /ext-src
+RUN case "${PG_VERSION:?}" in \
+    "v17") \
+        ;; \
+    *) \
+        echo "skipping the version of online_advisor for $PG_VERSION" && exit 0 \
+        ;; \
+    esac && \
+    wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
+    echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
+    mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS online_advisor-build
+COPY --from=online_advisor-src /ext-src/ /ext-src/
+WORKDIR /ext-src/
+# The source directory only exists when the -src stage did not skip this PG version.
+# Bound make parallelism to the number of online CPUs, like the other extension layers.
+RUN if [ -d online_advisor-src ]; then \
+        cd online_advisor-src && \
+        make -j $(getconf _NPROCESSORS_ONLN) install && \
+        echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
+    fi
+
 #########################################################################################
 #
 # Layer "pg_hashids-build"
@@ -1117,8 +1149,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.
    mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
    echo "#nothing to test here" > neon-test.sh

-RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz &&  \
-    echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz &&  \
+    echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \
    mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C .

 FROM rust-extensions-build-pgrx14 AS pgrag-build
@@ -1648,6 +1680,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1823,6 +1856,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
 COPY --from=pg_graphql-src /ext-src/ /ext-src/
 #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
 COPY --from=hypopg-src /ext-src/ /ext-src/
+COPY --from=online_advisor-src /ext-src/ /ext-src/
 COPY --from=pg_hashids-src /ext-src/ /ext-src/
 COPY --from=rum-src /ext-src/ /ext-src/
 COPY --from=pgtap-src /ext-src/ /ext-src/
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
 			 RelationGetRelationName(index));
 
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(index->rd_smgr);
++	smgr_start_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
 	rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
 
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
++	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
 	/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
 	}
 
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
++	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	/*
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -213,8 +213,10 @@ impl Escaping for PgIdent {

        // Find the first suitable tag that is not present in the string.
        // Postgres' max role/DB name length is 63 bytes, so even in the
-        // worst case it won't take long.
-        while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
+        // worst case it won't take long. Outer tag is always `tag + "x"`,
+        // so if `tag` is not present in the string, `outer_tag` is not
+        // present in the string either.
+        while self.contains(&tag.to_string()) {
            tag += "x";
            outer_tag = tag.clone() + "x";
        }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -27,6 +27,40 @@ fn get_rsyslog_pid() -> Option<String> {
    }
 }

+/// Poll `get_rsyslog_pid()` until it returns a PID or `MAX_WAIT` has elapsed.
+///
+/// Between polls the thread sleeps, starting at `INITIAL_SLEEP` and doubling
+/// the delay after every unsuccessful attempt.
+///
+/// Returns the PID string on success, or an error reporting how long was
+/// waited and how many attempts were made.
+fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
+    const MAX_WAIT: Duration = Duration::from_secs(5);
+    const INITIAL_SLEEP: Duration = Duration::from_millis(2);
+
+    let mut sleep_duration = INITIAL_SLEEP;
+    let start = std::time::Instant::now();
+    // Number of the last attempt made; only used for the error message.
+    let mut attempts = 1;
+
+    for attempt in 1.. {
+        attempts = attempt;
+        match get_rsyslog_pid() {
+            Some(pid) => return Ok(pid),
+            None => {
+                // The deadline is checked before sleeping, so we never sleep
+                // once the time budget is already used up.
+                if start.elapsed() >= MAX_WAIT {
+                    break;
+                }
+                info!(
+                    "rsyslogd is not running, attempt {}. Sleeping for {} ms",
+                    attempt,
+                    sleep_duration.as_millis()
+                );
+                std::thread::sleep(sleep_duration);
+                sleep_duration *= 2;
+            }
+        }
+    }
+
+    // Argument order must follow the placeholders: seconds first, then attempts.
+    Err(anyhow::anyhow!(
+        "rsyslogd is not running after waiting for {} seconds and {} attempts",
+        start.elapsed().as_secs(),
+        attempts
+    ))
+}
+
 // Restart rsyslogd to apply the new configuration.
 // This is necessary, because there is no other way to reload the rsyslog configuration.
 //
@@ -36,14 +70,14 @@ fn get_rsyslog_pid() -> Option<String> {
 // TODO: test it properly
 //
 fn restart_rsyslog() -> Result<()> {
-    let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
-    info!("rsyslogd is running with pid: {}, restart it", old_pid);
-
    // kill it to restart
    let _ = Command::new("pkill")
        .arg("rsyslogd")
        .output()
-        .context("Failed to stop rsyslogd")?;
+        .context("Failed to restart rsyslogd")?;
+
+    // ensure rsyslogd is running
+    wait_for_rsyslog_pid()?;

    Ok(())
 }
@@ -131,15 +165,11 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result
        return Ok(());
    }

-    // When new config is empty we can simply remove the configuration file.
+    // Nothing to configure
    if new_config.is_empty() {
-        info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
-        match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
-            Ok(_) => {}
-            Err(err) if err.kind() == ErrorKind::NotFound => {}
-            Err(err) => return Err(err.into()),
-        }
-        restart_rsyslog()?;
+        // When the configuration is removed, PostgreSQL will stop sending data
+        // to the files watched by rsyslog, so restarting rsyslog is more effort
+        // than just ignoring this change.
        return Ok(());
    }

--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -71,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
            ("name$$$", ("$x$name$$$$x$", "xx")),
            ("name$$$$", ("$x$name$$$$$x$", "xx")),
            ("name$x$", ("$xx$name$x$$xx$", "xxx")),
+            ("x", ("$xx$x$xx$", "xxx")),
+            ("xx", ("$xxx$xx$xxx$", "xxxx")),
+            ("$x", ("$xx$$x$xx$", "xxx")),
+            ("x$", ("$xx$x$$xx$", "xxx")),
+            ("$x$", ("$xx$$x$$xx$", "xxx")),
+            ("xx$", ("$xxx$xx$$xxx$", "xxxx")),
+            ("$xx", ("$xxx$$xx$xxx$", "xxxx")),
+            ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
        ];

        for (input, expected) in test_cases {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,7 +14,7 @@

 use std::ffi::OsStr;
 use std::io::Write;
-use std::os::unix::prelude::AsRawFd;
+use std::os::fd::AsFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
 use std::process::Command;
@@ -356,7 +356,7 @@ where
            let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
            // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
            // remains locked after exec.
-            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+            nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
                .expect("remove FD_CLOEXEC");
            // Don't run drop(file), it would close the file before we actually exec.
            std::mem::forget(file);
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,7 +8,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
-use std::os::fd::AsRawFd;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
 };
-use nix::fcntl::{FlockArg, flock};
+use nix::fcntl::{Flock, FlockArg};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -749,16 +748,16 @@ struct TimelineTreeEl {

 /// A flock-based guard over the neon_local repository directory
 struct RepoLock {
-    _file: File,
+    _file: Flock<File>,
 }

 impl RepoLock {
    fn new() -> Result<Self> {
        let repo_dir = File::open(local_env::base_path())?;
-        let repo_dir_fd = repo_dir.as_raw_fd();
-        flock(repo_dir_fd, FlockArg::LockExclusive)?;
-
-        Ok(Self { _file: repo_dir })
+        match Flock::lock(repo_dir, FlockArg::LockExclusive) {
+            Ok(f) => Ok(Self { _file: f }),
+            Err((_, e)) => Err(e).context("flock error"),
+        }
    }
 }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -546,6 +546,16 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Falied to parse 'sampling_ratio'")?,
+            // The key must be spelled with underscores, like the other settings here;
+            // otherwise the entry stays in `settings` and is rejected as unrecognized below.
+            relsize_snapshot_cache_capacity: settings
+                .remove("relsize_snapshot_cache_capacity")
+                .map(|x| x.parse::<usize>())
+                .transpose()
+                .context("Failed to parse 'relsize_snapshot_cache_capacity' as integer")?,
+            basebackup_cache_enabled: settings
+                .remove("basebackup_cache_enabled")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'basebackup_cache_enabled' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
--- a/docker-compose/ext-src/alter_db.sh
+++ b/docker-compose/ext-src/alter_db.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# We need these settings to get the expected output results.
+# We cannot use the environment variables e.g. PGTZ due to
+# https://github.com/neondatabase/neon/issues/1287
+export DATABASE=${1:-contrib_regression}
+psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \
+     -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \
+     -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \
--- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh
+++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh
@@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/}
 TESTS=${TESTS/sqli_connection/}
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression
 ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS}

--- a/docker-compose/ext-src/pgrag-src/regular-test.sh
+++ b/docker-compose/ext-src/pgrag-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname "${0}")"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag"
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin'    --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions
--- a/docker-compose/ext-src/pgx_ulid-src/Makefile
+++ b/docker-compose/ext-src/pgx_ulid-src/Makefile
@@ -20,5 +20,6 @@ installcheck: regression-test
 regression-test:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)"
 	$(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/plv8-src/regular-test.sh
+++ b/docker-compose/ext-src/plv8-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')"
 REGRESS="${REGRESS/startup_perms/}"
--- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
+++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
+++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/rum-src/regular-test.sh
+++ b/docker-compose/ext-src/rum-src/regular-test.sh
@@ -3,5 +3,6 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array
--- a/docs/rfcs/043-bottom-most-gc-compaction.md
+++ b/docs/rfcs/043-bottom-most-gc-compaction.md
@@ -0,0 +1,194 @@
+# Bottommost Garbage-Collection Compaction
+
+## Summary
+
+The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collection compaction (gc-compaction for short), and is part of the broader legacy-enhanced compaction that we plan to implement in the future.
+
+## Motivation
+
+The current GC algorithm waits until a key region is covered by image layers before collecting the garbage in that region. Relying on image layer generation to produce covering images is not reliable. There is prior art that feeds information from the GC algorithm back to the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification.
+
+# Basic Idea
+
+![](images/036-bottom-most-gc-compaction/01-basic-idea.svg)
+
+The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process,
+
+- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbages.
+- We produce images for all keys involved in the compaction process at the GC horizon.
+
+Therefore, it can precisely collect all garbage below the horizon and reduce the space amplification, e.g., in the staircase pattern (test_gc_feedback).
+
+![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png)
+
+The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.
+
+# Branches
+
+With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. 
+
+![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg)
+
+## Single Timeline w/ Snapshots: handle `retain_lsn`
+
+First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”).
+
+The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below:
+
+```
+LSN 0x10 -> A
+LSN 0x20 -> append B
+retain_lsn: 0x20
+LSN 0x30 -> append C
+LSN 0x40 -> append D
+retain_lsn: 0x40
+LSN 0x50 -> append E
+GC horizon: 0x50
+LSN 0x60 -> append F
+```
+
+The algorithm will produce:
+
+```
+LSN 0x20 -> AB
+(drop all history below the earliest retain_lsn)
+LSN 0x40 -> ABCD
+(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here)
+LSN 0x50 -> append E
+(replay one delta is cheap)
+LSN 0x60 -> append F
+(keep everything as-is above the GC horizon)
+```
+
+![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg)
+
+What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped.
+
+In the example above, the `$threshold` is 2.
+
+## Child Branches with data: pull + partial images
+
+In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. So, this section explains how we deal with that.
+
+We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. 
+
+```
+branch_lsn: 0x20
+LSN 0x30 -> append P
+LSN 0x40 -> append Q
+LSN 0x50 -> append R
+GC horizon: 0x50
+LSN 0x60 -> append S
+```
+
+Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criteria, and if it decides to create an image, the base image will be retrieved from the ancestor branch.
+
+```
+branch_lsn: 0x20
+LSN 0x50 -> ABPQR
+(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply the appends P, Q and R to the page; we replace the record at 0x50 with an image and drop the deltas at 0x30/0x40/0x50)
+GC horizon: 0x50
+LSN 0x60 -> append S
+```
+
+![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg)
+
+Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch.
+
+# Result
+
+Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsn to ensure branch reads are fast. As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before.
+
+Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range
+
+After: sum(min(logs for each key, image for each key))
+
+# Compaction Trigger
+
+The bottom-most compaction can be automatically triggered. The goal of the trigger is to ensure a constant factor for write amplification. Say that the user writes 1GB of WAL into the system; we should then write 1GB x C data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic in the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)).
+
+We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification.
+
+Given that we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon.
+
+The legacy GC + compaction loop will always keep ***A*** unchanged, reduce ***B and C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space.
+
+![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg)
+
+The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon. Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space.
+
+![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg)
+
+We should also consider read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon.
+
+The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**.
+
+To reason about this trigger, consider the two cases:
+
+**Data Ingestion**
+
+The user keeps ingesting data into the database, which means that the WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written.
+
+![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg)
+
+**Updates/Deletion**
+
+In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. 
+
+![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg)
+
+Note that the reasoning here aims to show that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes). The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size.
+
+The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor.
+
+20GB layers → +20GB layers → delete 20GB, need 40GB temporary space
+
+# Sub-Compactions
+
+The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs.
+
+![](images/036-bottom-most-gc-compaction/13-job-split.svg)
+
+As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. If we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5).
+
+Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job's rectangle will be rewritten to contain only the keys outside of the job's key range.
+
+# Implementation
+
+The main implementation of gc-compaction is in `compaction.rs`.
+
+* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or where to insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch. For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range.
+* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files.
+* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "basic idea" and "branch" chapter of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible.
+* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried.
+* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction.
+* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait. See `gc_compaction_layer_update_lock` and comments in the code path for more information.
+
+Gc-compaction can also be scheduled over the HTTP API. Example:
+
+```
+curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }'
+```
+
+The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map.
+
+The auto-trigger is controlled by tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works).
+
+# Next Steps
+
+There are still some limitations of gc-compaction itself that need to be resolved and tested:
+
+- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging.
+- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones.
+- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs because a key might have only part of its history available due to traditional GC removing part of the history.
+- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long.
+- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon. Currently we do not schedule compaction jobs that only select layers in the middle. Allowing this could potentially reduce the number of layers read and written throughout the process.
+- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction.
+- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer.
+- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring to collect all the key history.
+
+In the future,
+
+- Top-most compaction: ensure we always have an image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN.
+- Tiered compaction on deltas: ensure read from any LSN is fast.
+- Per-timeline compaction → tenant-wide compaction?
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg
@@ -0,0 +1,135 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="82 284 863 375" width="863" height="375">
+  <defs/>
+  <g id="01-basic-idea" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>01-basic-idea</title>
+    <rect fill="white" x="82" y="284" width="863" height="375"/>
+    <g id="01-basic-idea_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_2">
+        <rect x="234" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="234" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <rect x="453.5" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="453.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_4">
+        <rect x="672.5" y="379.5" width="203.5" height="17.5" fill="white"/>
+        <rect x="672.5" y="379.5" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_5">
+        <rect x="234" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="234" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_6">
+        <rect x="375" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="375" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <rect x="516" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="516" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_8">
+        <rect x="657" y="288.5" width="127" height="77.5" fill="white"/>
+        <rect x="657" y="288.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_9">
+        <rect x="798" y="288.5" width="78" height="77.5" fill="white"/>
+        <rect x="798" y="288.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_11">
+        <line x1="185.5" y1="326.75" x2="943.7734" y2="326.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_12">
+        <text transform="translate(87 318.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_13">
+        <text transform="translate(106.41 372.886)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_14">
+        <text transform="translate(121.92 289.578)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_15">
+        <path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" fill="white"/>
+        <path d="M 517.125 423.5 L 553.375 423.5 L 553.375 482 L 571.5 482 L 535.25 512 L 499 482 L 517.125 482 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="234" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="234" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <rect x="453.5" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="453.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_24">
+        <rect x="672.5" y="599.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="672.5" y="599.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_23">
+        <rect x="234" y="533" width="127" height="52.974" fill="white"/>
+        <rect x="234" y="533" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_22">
+        <rect x="375" y="533" width="310.5" height="52.974" fill="white"/>
+        <rect x="375" y="533" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="702.5" y="533" width="173.5" height="52.974" fill="white"/>
+        <rect x="702.5" y="533" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_18">
+        <line x1="185.5" y1="607.724" x2="943.7734" y2="607.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <text transform="translate(121.92 538)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_27">
+        <text transform="translate(114.8 592.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <rect x="243.06836" y="300" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(248.06836 301.068)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_30">
+        <rect x="243.06836" y="335.5" width="624.3633" height="17.5" fill="#c0ffff"/>
+        <text transform="translate(248.06836 336.568)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_32">
+        <rect x="243.06836" y="550.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(248.06836 551.805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <rect x="304" y="630.474" width="485.5" height="28.447998" fill="#c0ffff"/>
+        <text transform="translate(309 637.016)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
+        </text>
+      </g>
+      <g id="Graphic_34">
+        <text transform="translate(576.5 444.0325)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="11" xml:space="preserve">WAL replay of deltas+image below GC Horizon</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="0" y="25.336" xml:space="preserve">Reshuffle deltas</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg
@@ -0,0 +1,141 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-104 215 863 335" width="863" height="335">
+  <defs>
+    <marker orient="auto" overflow="visible" markerUnits="strokeWidth" id="FilledArrow_Marker" stroke-linejoin="miter" stroke-miterlimit="10" viewBox="-1 -4 10 8" markerWidth="10" markerHeight="8" color="#7f8080">
+      <g>
+        <path d="M 8 0 L 0 -3 L 0 3 Z" fill="currentColor" stroke="currentColor" stroke-width="1"/>
+      </g>
+    </marker>
+  </defs>
+  <g id="03-retain-lsn" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>03-retain-lsn</title>
+    <rect fill="white" x="-104" y="215" width="863" height="335"/>
+    <g id="03-retain-lsn_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_28">
+        <rect x="48" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_27">
+        <rect x="267.5" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="486.5" y="477" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.5" y="477" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-.5" y1="387.172" x2="757.7734" y2="387.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-99 378.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_31">
+        <rect x="48.25" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48.25" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_30">
+        <rect x="267.75" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <rect x="486.75" y="410" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.75" y="410" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_34">
+        <rect x="48.25" y="431.495" width="113.75" height="34" fill="white"/>
+        <rect x="48.25" y="431.495" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_33">
+        <rect x="172.5" y="431.495" width="203.5" height="34" fill="white"/>
+        <rect x="172.5" y="431.495" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_32">
+        <rect x="386.5" y="431.495" width="303.5" height="34" fill="white"/>
+        <rect x="386.5" y="431.495" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_37">
+        <rect x="48" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="48" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <rect x="267.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="267.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_35">
+        <rect x="486.5" y="498.495" width="203.5" height="9.990005" fill="white"/>
+        <rect x="486.5" y="498.495" width="203.5" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_38">
+        <line x1="-10.48" y1="535.5395" x2="39.318294" y2="508.24794" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_39">
+        <text transform="translate(-96.984 526.3155)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 1</tspan>
+        </text>
+      </g>
+      <g id="Line_41">
+        <line x1="-10.48" y1="507.0915" x2="38.90236" y2="485.8992" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_40">
+        <text transform="translate(-96.984 497.8675)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 2</tspan>
+        </text>
+      </g>
+      <g id="Line_43">
+        <line x1="-10.48" y1="478.6435" x2="39.44267" y2="453.01616" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_42">
+        <text transform="translate(-96.984 469.4195)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
+        </text>
+      </g>
+      <g id="Line_45">
+        <line x1="-10.48" y1="448.495" x2="39.65061" y2="419.90015" marker-end="url(#FilledArrow_Marker)" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_44">
+        <text transform="translate(-96.984 439.271)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 4</tspan>
+        </text>
+      </g>
+      <g id="Graphic_46">
+        <rect x="335.46477" y="215.5" width="353.4299" height="125.495" fill="white"/>
+        <rect x="335.46477" y="215.5" width="353.4299" height="125.495" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_48">
+        <text transform="translate(549.3766 317.547)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="6536993e-19" y="15" xml:space="preserve">Dependent Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_50">
+        <text transform="translate(340.43824 317.547)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="40500936e-20" y="15" xml:space="preserve">retain_lsn 3</tspan>
+        </text>
+      </g>
+      <g id="Line_57">
+        <line x1="323.90685" y1="248.8045" x2="714.9232" y2="248.8045" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_56">
+        <text transform="translate(165.91346 240.0805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="35811354e-19" y="15" xml:space="preserve">Branch GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_58">
+        <rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
+        <rect x="493.9232" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_59">
+        <text transform="translate(358.9232 277.276)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Partial Image Coverage</tspan>
+        </text>
+      </g>
+      <g id="Graphic_60">
+        <rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" fill="white"/>
+        <rect x="354.1732" y="301.6405" width="107.45294" height="9.990005" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg
@@ -0,0 +1,187 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 426 864 366" width="864" height="366">
+  <defs/>
+  <g id="05-btmgc-parent" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>05-btmgc-parent</title>
+    <rect fill="white" x="-235" y="426" width="864" height="366"/>
+    <g id="05-btmgc-parent_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_23">
+        <rect x="-83" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-83" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-78 516.178)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="51.714" y="11" xml:space="preserve">Append C@0x30</tspan>
+        </text>
+      </g>
+      <g id="Graphic_22">
+        <rect x="136.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="355.5" y="510.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.5" y="510.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-100.448" y1="459.224" x2="626.77344" y2="459.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-230 450.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_18">
+        <rect x="-82.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-82.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77.75 432.776)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.602" y="11" xml:space="preserve">Append F@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <rect x="136.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <rect x="355.75" y="426.748" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.75" y="426.748" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_15">
+        <rect x="-82.75" y="464.645" width="113.75" height="34" fill="white"/>
+        <rect x="-82.75" y="464.645" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77.75 467.309)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.947" y="25.336" xml:space="preserve">Append D@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_14">
+        <rect x="41.5" y="464.645" width="203.5" height="34" fill="white"/>
+        <rect x="41.5" y="464.645" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_13">
+        <rect x="255.5" y="464.645" width="303.5" height="34" fill="white"/>
+        <rect x="255.5" y="464.645" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_12">
+        <rect x="-83" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-83" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-78 554.075)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="26.796" y="11" xml:space="preserve">A@0x10, Append B@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="136.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="136.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="355.5" y="548.047" width="203.5" height="26.391998" fill="white"/>
+        <rect x="355.5" y="548.047" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_24">
+        <line x1="-104" y1="542" x2="610.5" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-139.604 534.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <text transform="translate(-139.604 452.556)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-100.448" y1="481.145" x2="614.052" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-139.604 473.449)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Line_48">
+        <line x1="-99.448" y1="701.513" x2="627.77344" y2="701.513" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_47">
+        <text transform="translate(-229 692.789)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_46">
+        <rect x="-81.75" y="670.496" width="113.75" height="26.391998" fill="white"/>
+        <rect x="-81.75" y="670.496" width="113.75" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 676.524)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.727" y="11" xml:space="preserve">Append F@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_43">
+        <rect x="-81.75" y="708.393" width="113.75" height="34" fill="white"/>
+        <rect x="-81.75" y="708.393" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 718.225)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.505" y="11" xml:space="preserve">Append E@0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_37">
+        <line x1="-101" y1="777.2665" x2="613.5" y2="777.2665" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <text transform="translate(-138.604 769.7665)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <text transform="translate(-138.604 694.845)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_32">
+        <line x1="-99.448" y1="755.089" x2="615.052" y2="755.089" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_31">
+        <text transform="translate(-138.604 747.393)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_40">
+        <rect x="-82" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-82" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-77 770.7945)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_39">
+        <rect x="137.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="137.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_38">
+        <rect x="356.5" y="770.909" width="203.5" height="14.107002" fill="white"/>
+        <rect x="356.5" y="770.909" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_54">
+        <rect x="-81.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-81.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-76.75 748.421)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="62.28" y="11" xml:space="preserve">ABCD@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_53">
+        <rect x="137.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="137.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_52">
+        <rect x="356.75" y="748.5355" width="203.5" height="14.107002" fill="white"/>
+        <rect x="356.75" y="748.5355" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_57">
+        <path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" fill="white"/>
+        <path d="M 211.32422 585 L 265.17578 585 L 265.17578 611.332 L 287.84375 611.332 L 238.25 633.117 L 188.65625 611.332 L 211.32422 611.332 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_60">
+        <rect x="359" y="692.858" width="203.5" height="14.107002" fill="white"/>
+        <rect x="359" y="692.858" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_59">
+        <rect x="41.5" y="693.858" width="303" height="14.107002" fill="white"/>
+        <rect x="41.5" y="693.858" width="303" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-413 471 931 354" width="931" height="354">
+  <defs/>
+  <g id="06-btmgc-child" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>06-btmgc-child</title>
+    <rect fill="white" x="-413" y="471" width="931" height="354"/>
+    <g id="06-btmgc-child_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_47">
+        <rect x="-412" y="594.402" width="928" height="28.447998" fill="white"/>
+        <rect x="-412" y="594.402" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_46">
+        <rect x="-205" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-205" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-200 561.58)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append P@0x30</tspan>
+        </text>
+      </g>
+      <g id="Graphic_45">
+        <rect x="14.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_44">
+        <rect x="233.5" y="555.552" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.5" y="555.552" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_43">
+        <line x1="-222.448" y1="504.724" x2="504.77344" y2="504.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_42">
+        <text transform="translate(-352 496)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_41">
+        <rect x="-204.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-204.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199.75 478.178)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.158" y="11" xml:space="preserve">Append S@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_40">
+        <rect x="14.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_39">
+        <rect x="233.75" y="472.15" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.75" y="472.15" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_38">
+        <rect x="-204.75" y="510.047" width="113.75" height="34" fill="white"/>
+        <rect x="-204.75" y="510.047" width="113.75" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199.75 512.711)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="7.061" y="11" xml:space="preserve">Append R@0x50</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="6.611" y="25.336" xml:space="preserve">Append Q@0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_37">
+        <rect x="-80.5" y="510.047" width="203.5" height="34" fill="white"/>
+        <rect x="-80.5" y="510.047" width="203.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_36">
+        <rect x="133.5" y="510.047" width="303.5" height="34" fill="white"/>
+        <rect x="133.5" y="510.047" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_33">
+        <text transform="translate(-261.604 498.056)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-224" y1="607.9115" x2="490.5" y2="607.9115" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-261.604 600.4115)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <rect x="-205" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-205" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-200 601.4395)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="70.836" y="11" xml:space="preserve">AB@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_27">
+        <rect x="14.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="14.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <rect x="233.5" y="601.554" width="203.5" height="14.107002" fill="white"/>
+        <rect x="233.5" y="601.554" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-407 599.1875)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_24">
+        <rect x="-411" y="795.46" width="928" height="28.447998" fill="white"/>
+        <rect x="-411" y="795.46" width="928" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-221.448" y1="755.528" x2="505.77344" y2="755.528" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-351 746.804)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_18">
+        <rect x="-203.75" y="723.579" width="203.25" height="26.391998" fill="white"/>
+        <rect x="-203.75" y="723.579" width="203.25" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-198.75 729.607)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" x="52.033" y="11" xml:space="preserve">Append S@0x60</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <text transform="translate(-260.604 748.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Line_7">
+        <line x1="-223" y1="808.9695" x2="491.5" y2="808.9695" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_6">
+        <text transform="translate(-260.604 801.4695)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_5">
+        <rect x="-204" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="-204" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199 802.4975)" fill="#b1001c">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="70.836" y="11" xml:space="preserve">AB</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">@0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_4">
+        <rect x="15.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="15.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <rect x="234.5" y="802.612" width="203.5" height="14.107002" fill="white"/>
+        <rect x="234.5" y="802.612" width="203.5" height="14.107002" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_2">
+        <text transform="translate(-406 800.2455)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">Ancestor Branch</tspan>
+        </text>
+      </g>
+      <g id="Graphic_48">
+        <path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" fill="white"/>
+        <path d="M 89.32422 639.081 L 143.17578 639.081 L 143.17578 665.413 L 165.84375 665.413 L 116.25 687.198 L 66.65625 665.413 L 89.32422 665.413 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_49">
+        <rect x="-204" y="762.428" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-204" y="762.428" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-199 768.456)" fill="#b1001c">
+          <tspan font-family="Helvetica Neue" font-size="12" fill="#b1001c" x="58.278" y="11" xml:space="preserve">AB</tspan>
+          <tspan font-family="Helvetica Neue" font-size="12" fill="black" y="11" xml:space="preserve">PQR@0x50</tspan>
+        </text>
+      </g>
+      <g id="Graphic_59">
+        <rect x="14.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
+        <rect x="14.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_58">
+        <rect x="233.5" y="723.579" width="203.5" height="26.391998" fill="white"/>
+        <rect x="233.5" y="723.579" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_63">
+        <rect x="9" y="762.085" width="203.5" height="26.391998" fill="white"/>
+        <rect x="9" y="762.085" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_62">
+        <rect x="225" y="762.085" width="213" height="26.391998" fill="white"/>
+        <rect x="225" y="762.085" width="213" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg
@@ -0,0 +1,180 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-556 476 923 411" width="923" height="411">
+  <defs/>
+  <g id="07-btmgc-analysis-1" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>07-btmgc-analysis-1</title>
+    <rect fill="white" x="-556" y="476" width="923" height="411"/>
+    <g id="07-btmgc-analysis-1_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_85">
+        <rect x="-404" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="-404" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_84">
+        <rect x="-184.5" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_83">
+        <rect x="34.5" y="609.062" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.5" y="609.062" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_82">
+        <rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_81">
+        <rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_80">
+        <rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_79">
+        <rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
+        <rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_77">
+        <line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <text transform="translate(-551 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_75">
+        <text transform="translate(-531.59 602.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.39" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="29132252e-19" y="28.447998" xml:space="preserve">at earlier LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_74">
+        <text transform="translate(-516.08 481)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="-403.8" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="-403.8" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="-184.3" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.3" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="34.7" y="827.474" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.7" y="827.474" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="-403.8" y="761" width="127" height="52.974" fill="white"/>
+        <rect x="-403.8" y="761" width="127" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="-262.8" y="761" width="310.5" height="52.974" fill="white"/>
+        <rect x="-262.8" y="761" width="310.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_67">
+        <rect x="64.7" y="761" width="173.5" height="52.974" fill="white"/>
+        <rect x="64.7" y="761" width="173.5" height="52.974" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_66">
+        <line x1="-452.3" y1="835.724" x2="251.2" y2="835.724" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_65">
+        <text transform="translate(-515.88 766)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8739676e-19" y="15" xml:space="preserve">Deltas</tspan>
+        </text>
+      </g>
+      <g id="Graphic_64">
+        <text transform="translate(-523 820.86)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="3488765e-18" y="15" xml:space="preserve">Images </tspan>
+          <tspan font-family="Helvetica Neue" font-size="10" fill="black" x="4.01" y="28.447998" xml:space="preserve">at GC LSN</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <rect x="-394.93164" y="491.422" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(-389.93164 492.49)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_62">
+        <rect x="-394.93164" y="526.922" width="624.3633" height="17.5" fill="#c0ffff"/>
+        <text transform="translate(-389.93164 527.99)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.89414" y="12" xml:space="preserve">Deltas below GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_61">
+        <rect x="-394.73164" y="778.737" width="624.3633" height="17.5" fill="#c0ffc0"/>
+        <text transform="translate(-389.73164 779.805)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="233.52364" y="12" xml:space="preserve">Deltas above GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_60">
+        <rect x="-333.8" y="858.474" width="485.5" height="28.447998" fill="#c0ffff"/>
+        <text transform="translate(-328.8 865.016)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="13" fill="black" x="63.095" y="12" xml:space="preserve">Deltas and image below GC Horizon gets garbage-collected</tspan>
+        </text>
+      </g>
+      <g id="Graphic_86">
+        <text transform="translate(263 499.724)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="32" fill="black" x="0" y="30" xml:space="preserve">size=A</tspan>
+        </text>
+      </g>
+      <g id="Line_87">
+        <line x1="260.87012" y1="479.068" x2="360.71387" y2="479.068" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_88">
+        <line x1="260.87012" y1="561" x2="360.71387" y2="561" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_89">
+        <rect x="-403.8" y="569" width="161.8" height="28.447998" fill="white"/>
+        <rect x="-403.8" y="569" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_90">
+        <rect x="-229.5" y="569.018" width="277.2" height="28.447998" fill="white"/>
+        <rect x="-229.5" y="569.018" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_91">
+        <rect x="64.7" y="569.018" width="173.5" height="28.447998" fill="white"/>
+        <rect x="64.7" y="569.018" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_92">
+        <line x1="262" y1="602" x2="361.84375" y2="602" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_93">
+        <line x1="263" y1="625.562" x2="362.84375" y2="625.562" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_94">
+        <text transform="translate(264.53787 562.276)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="32" fill="black" x="14210855e-21" y="30" xml:space="preserve">size=B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_95">
+        <text transform="translate(285.12 599.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="24" fill="black" x="0" y="23" xml:space="preserve">size=C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_98">
+        <text transform="translate(264.53787 773.772)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="8881784e-19" y="25" xml:space="preserve">A</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+      <g id="Graphic_97">
+        <text transform="translate(265.87013 815.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg
@@ -0,0 +1,158 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-235 406 586 424" width="586" height="424">
+  <defs/>
+  <g id="08-optimization" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>08-optimization</title>
+    <rect fill="white" x="-235" y="406" width="586" height="424"/>
+    <g id="08-optimization_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_22">
+        <rect x="-100.448" y="509.902" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_21">
+        <rect x="118.552" y="509.902" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="509.902" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_20">
+        <line x1="-101.79572" y1="420.322" x2="349.5" y2="420.322" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_19">
+        <text transform="translate(-230 411.598)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <rect x="-100.198" y="426.5" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_16">
+        <rect x="118.802" y="426.5" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.802" y="426.5" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_14">
+        <rect x="-100.198" y="464.397" width="108.25" height="34" fill="white"/>
+        <rect x="-100.198" y="464.397" width="108.25" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_13">
+        <rect x="18.552" y="464.397" width="303.5" height="34" fill="white"/>
+        <rect x="18.552" y="464.397" width="303.5" height="34" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_11">
+        <rect x="-100.448" y="547.799" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="118.552" y="547.799" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="547.799" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_24">
+        <line x1="-104" y1="542" x2="339.4011" y2="542" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_25">
+        <text transform="translate(-139.604 534.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Line_27">
+        <line x1="-101.79572" y1="459.098" x2="341.6054" y2="459.098" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-139.604 451.402)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+      <g id="Graphic_28">
+        <text transform="translate(-139.604 413.654)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x60</tspan>
+        </text>
+      </g>
+      <g id="Line_30">
+        <line x1="-101.79572" y1="481.145" x2="341.6054" y2="481.145" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_29">
+        <text transform="translate(-139.604 473.449)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_77">
+        <rect x="-100.448" y="765.19595" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <rect x="118.552" y="765.19595" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="765.19595" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_75">
+        <line x1="-101.79572" y1="637.317" x2="349.5" y2="637.317" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_74">
+        <text transform="translate(-230 628.593)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <rect x="-100.198" y="681.794" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="118.802" y="681.794" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.802" y="681.794" width="203.5" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="-100.198" y="719.69096" width="108.25" height="34" fill="white"/>
+        <rect x="-100.198" y="719.69096" width="108.25" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="18.552" y="719.69096" width="303.5" height="34" fill="white"/>
+        <rect x="18.552" y="719.69096" width="303.5" height="34" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="-100.448" y="803.09295" width="203.5" height="26.391998" fill="white"/>
+        <rect x="-100.448" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="118.552" y="803.09295" width="203.5" height="26.391998" fill="white"/>
+        <rect x="118.552" y="803.09295" width="203.5" height="26.391998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_67">
+        <line x1="-104" y1="797.294" x2="339.4011" y2="797.294" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_66">
+        <text transform="translate(-139.604 789.794)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x20</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <text transform="translate(-139.604 630.649)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x70</tspan>
+        </text>
+      </g>
+      <g id="Line_62">
+        <line x1="-101.79572" y1="736.439" x2="341.6054" y2="736.439" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="4.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_61">
+        <text transform="translate(-139.604 728.743)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="black" x="0" y="13" xml:space="preserve">0x40</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="-100.198" y="644.393" width="168.198" height="26.391998" fill="white"/>
+        <rect x="-100.198" y="644.393" width="168.198" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="80" y="644.393" width="242.302" height="26.391998" fill="white"/>
+        <rect x="80" y="644.393" width="242.302" height="26.391998" stroke="#b1001c" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_81">
+        <line x1="-101.79572" y1="714.139" x2="341.6054" y2="714.139" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-dasharray="1.0,4.0" stroke-width="1"/>
+      </g>
+      <g id="Graphic_80">
+        <text transform="translate(-139.604 706.443)" fill="#a5a5a5">
+          <tspan font-family="Helvetica Neue" font-size="14" fill="#a5a5a5" x="0" y="13" xml:space="preserve">0x50</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg
@@ -0,0 +1,184 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-562 479 876 429" width="876" height="429">
+  <defs/>
+  <g id="09-btmgc-analysis-2" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>09-btmgc-analysis-2</title>
+    <rect fill="white" x="-562" y="479" width="876" height="429"/>
+    <g id="09-btmgc-analysis-2_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_85">
+        <rect x="-404" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="-404" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-399 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_84">
+        <rect x="-184.5" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="-184.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-179.5 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_83">
+        <rect x="34.5" y="622.386" width="203.5" height="17.5" fill="white"/>
+        <rect x="34.5" y="622.386" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(39.5 621.912)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_82">
+        <rect x="-404" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-404" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-399 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_81">
+        <rect x="-263" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-263" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-258 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_80">
+        <rect x="-122" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="-122" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-117 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="19" y="479.922" width="127" height="77.5" fill="white"/>
+        <rect x="19" y="479.922" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(24 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_78">
+        <rect x="160" y="479.922" width="78" height="77.5" fill="white"/>
+        <rect x="160" y="479.922" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(165 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Line_77">
+        <line x1="-452.5" y1="518.172" x2="251" y2="518.172" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_76">
+        <text transform="translate(-551 509.448)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_73">
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" fill="white"/>
+        <path d="M -120.675 651.5 L -84.425 651.5 L -84.425 710 L -66.3 710 L -102.55 740 L -138.8 710 L -120.675 710 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_89">
+        <rect x="-403.8" y="582.324" width="161.8" height="28.447998" fill="white"/>
+        <rect x="-403.8" y="582.324" width="161.8" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-398.8 587.324)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="70.42" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_90">
+        <rect x="-229.5" y="582.342" width="277.2" height="28.447998" fill="white"/>
+        <rect x="-229.5" y="582.342" width="277.2" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-224.5 587.342)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="128.12" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_91">
+        <rect x="64.7" y="582.342" width="173.5" height="28.447998" fill="white"/>
+        <rect x="64.7" y="582.342" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(69.7 587.342)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_97">
+        <rect x="-403.8" y="564.842" width="490.8" height="12.157997" fill="white"/>
+        <rect x="-403.8" y="564.842" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-398.8 561.697)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_109">
+        <rect x="28.6" y="889.964" width="203.5" height="17.5" fill="white"/>
+        <rect x="28.6" y="889.964" width="203.5" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(33.6 889.49)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="90.974" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_108">
+        <rect x="-409.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-409.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-404.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_107">
+        <rect x="-268.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-268.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-263.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_106">
+        <rect x="-127.9" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="-127.9" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-122.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_105">
+        <rect x="13.1" y="747.5" width="127" height="77.5" fill="white"/>
+        <rect x="13.1" y="747.5" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(18.1 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="53.316" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Graphic_104">
+        <rect x="154.1" y="747.5" width="78" height="77.5" fill="white"/>
+        <rect x="154.1" y="747.5" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(159.1 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="28.816" y="15" xml:space="preserve">A</tspan>
+        </text>
+      </g>
+      <g id="Line_103">
+        <line x1="-458.4" y1="785.75" x2="245.1" y2="785.75" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_102">
+        <text transform="translate(-556.9 777.026)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_99">
+        <rect x="58.8" y="849.92" width="173.5" height="28.447998" fill="white"/>
+        <rect x="58.8" y="849.92" width="173.5" height="28.447998" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(63.8 854.92)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="76.27" y="15" xml:space="preserve">B</tspan>
+        </text>
+      </g>
+      <g id="Graphic_98">
+        <rect x="-409.7" y="832.42" width="490.8" height="12.157997" fill="white"/>
+        <rect x="-409.7" y="832.42" width="490.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(-404.7 829.275)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="234.624" y="15" xml:space="preserve">C</tspan>
+        </text>
+      </g>
+      <g id="Graphic_112">
+        <text transform="translate(273 797.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="6536993e-19" y="25" xml:space="preserve">B</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+      <g id="Graphic_113">
+        <text transform="translate(273 833.974)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="26" fill="black" x="42277293e-20" y="25" xml:space="preserve">C</tspan>
+          <tspan font-family="Lucida Grande" font-size="26" fill="black" y="25" xml:space="preserve">↓</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
+  <defs/>
+  <g id="10-btmgc-analysis-3" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>10-btmgc-analysis-3</title>
+    <rect fill="white" x="-12" y="920" width="809" height="269"/>
+    <g id="10-btmgc-analysis-3_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_13">
+        <rect x="433.7" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="433.7" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(438.7 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_12">
+        <rect x="503.7654" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="503.7654" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(508.7654 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="574.8318" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="574.8318" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(579.8318 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <rect x="645.3977" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="645.3977" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(650.3977 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+      <g id="Line_8">
+        <line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <text transform="translate(-6.500003 925.552)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_2">
+        <rect x="113.2" y="1033.92" width="321.3" height="12.157997" fill="white"/>
+        <rect x="113.2" y="1033.92" width="321.3" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118.2 1030.775)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="150.762" y="15" xml:space="preserve">X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_27">
+        <line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-5.5000034 1155.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_25">
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(119 1170.355)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="306.564" y="15" xml:space="preserve">2X</tspan>
+        </text>
+      </g>
+      <g id="Graphic_33">
+        <rect x="715.96355" y="949" width="63.559346" height="77.5" fill="white"/>
+        <rect x="715.96355" y="949" width="63.559346" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(720.96355 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="8.107673" y="15" xml:space="preserve">1/5 X</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" version="1.1" viewBox="-12 920 809 269" width="809" height="269">
+  <defs/>
+  <g id="11-btmgc-analysis-4" stroke-opacity="1" stroke-dasharray="none" stroke="none" fill="none" fill-opacity="1">
+    <title>11-btmgc-analysis-4</title>
+    <rect fill="white" x="-12" y="920" width="809" height="269"/>
+    <g id="11-btmgc-analysis-4_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_13">
+        <rect x="113" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="113" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_12">
+        <rect x="253" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="253" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(258 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_11">
+        <rect x="395" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="395" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(400 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_10">
+        <rect x="536" y="949" width="127" height="77.5" fill="white"/>
+        <rect x="536" y="949" width="127" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(541 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="39.084" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_9">
+        <rect x="677" y="949" width="78" height="77.5" fill="white"/>
+        <rect x="677" y="949" width="78" height="77.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(682 978.526)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="14.584" y="15" xml:space="preserve">1/5 D</tspan>
+        </text>
+      </g>
+      <g id="Line_8">
+        <line x1="92" y1="934.276" x2="795.5" y2="934.276" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <text transform="translate(-6.500003 925.552)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_2">
+        <rect x="113.2" y="1033.92" width="641.8" height="12.157997" fill="white"/>
+        <rect x="113.2" y="1033.92" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(118.2 1030.775)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
+        </text>
+      </g>
+      <g id="Graphic_17">
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" fill="white"/>
+        <path d="M 420.125 1062 L 456.375 1062 L 456.375 1120.5 L 474.5 1120.5 L 438.25 1150.5 L 402 1120.5 L 420.125 1120.5 Z" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_27">
+        <line x1="93" y1="1164.224" x2="796.5" y2="1164.224" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_26">
+        <text transform="translate(-5.5000034 1155.5)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">GC Horizon</tspan>
+        </text>
+      </g>
+      <g id="Graphic_25">
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" fill="white"/>
+        <rect x="114" y="1173.5" width="641.8" height="12.157997" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(119 1170.355)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="310.268" y="15" xml:space="preserve">D</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png
--- a/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg
+++ b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg
@@ -0,0 +1,176 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns="http://www.w3.org/2000/svg" xmlns:xl="http://www.w3.org/1999/xlink" viewBox="210 271 870 514" width="870" height="514">
+  <defs/>
+  <g id="gc-compaction-split" stroke-dasharray="none" fill-opacity="1" stroke="none" fill="none" stroke-opacity="1">
+    <title>gc-compaction-split</title>
+    <rect fill="white" x="210" y="271" width="870" height="514"/>
+    <g id="gc-compaction-split_Layer_1">
+      <title>Layer 1</title>
+      <g id="Graphic_12">
+        <rect x="241" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="241" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_11">
+        <rect x="468.72266" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="468.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_10">
+        <rect x="695.72266" y="272" width="213" height="50.5" fill="white"/>
+        <rect x="695.72266" y="272" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_9">
+        <rect x="241" y="337.3711" width="303.5" height="50.5" fill="white"/>
+        <rect x="241" y="337.3711" width="303.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_8">
+        <rect x="556.2617" y="337.3711" width="352.46094" height="50.5" fill="white"/>
+        <rect x="556.2617" y="337.3711" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_7">
+        <rect x="241" y="402.7422" width="667.72266" height="50.5" fill="white"/>
+        <rect x="241" y="402.7422" width="667.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_6">
+        <line x1="211" y1="355.5" x2="947.4961" y2="355.5" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_5">
+        <text transform="translate(952.4961 346.776)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
+        </text>
+      </g>
+      <g id="Line_4">
+        <line x1="212" y1="438.5182" x2="948.4961" y2="438.5182" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_3">
+        <text transform="translate(953.4961 429.7942)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
+        </text>
+      </g>
+      <g id="Graphic_13">
+        <rect x="241" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(246 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 1</tspan>
+        </text>
+      </g>
+      <g id="Graphic_57">
+        <rect x="359" y="647.96484" width="551.72266" height="50.5" fill="white"/>
+        <rect x="359" y="647.96484" width="551.72266" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_54">
+        <rect x="359" y="517.22266" width="96" height="50.5" fill="white"/>
+        <rect x="359" y="517.22266" width="96" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_53">
+        <rect x="469.72266" y="517.22266" width="213" height="50.5" fill="white"/>
+        <rect x="469.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_52">
+        <rect x="696.72266" y="517.22266" width="213" height="50.5" fill="white"/>
+        <rect x="696.72266" y="517.22266" width="213" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_51">
+        <rect x="359" y="582.59375" width="186.5" height="50.5" fill="white"/>
+        <rect x="359" y="582.59375" width="186.5" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_50">
+        <rect x="557.2617" y="582.59375" width="352.46094" height="50.5" fill="white"/>
+        <rect x="557.2617" y="582.59375" width="352.46094" height="50.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Line_49">
+        <line x1="212" y1="600.72266" x2="948.4961" y2="600.72266" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_48">
+        <text transform="translate(953.4961 591.99866)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">branch point</tspan>
+        </text>
+      </g>
+      <g id="Line_47">
+        <line x1="213" y1="683.74084" x2="949.4961" y2="683.74084" stroke="#7f8080" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_46">
+        <text transform="translate(954.4961 675.01685)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="0" y="15" xml:space="preserve">last branch point</tspan>
+        </text>
+      </g>
+      <g id="Graphic_63">
+        <rect x="376.72525" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(381.72525 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 2</tspan>
+        </text>
+      </g>
+      <g id="Graphic_64">
+        <rect x="511.39405" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(516.39405 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 3</tspan>
+        </text>
+      </g>
+      <g id="Graphic_65">
+        <rect x="646.06285" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(651.06285 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 4</tspan>
+        </text>
+      </g>
+      <g id="Graphic_66">
+        <rect x="780.73165" y="272" width="127.99101" height="181.24219" fill="#3a8eed" fill-opacity=".5"/>
+        <text transform="translate(785.73165 353.3971)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="38.835502" y="15" xml:space="preserve">Job 5</tspan>
+        </text>
+      </g>
+      <g id="Graphic_56">
+        <rect x="243.5" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="243.5" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_55">
+        <rect x="243.5" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="243.5" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_68">
+        <rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="379.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_67">
+        <rect x="379.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="379.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_70">
+        <rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="514.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_69">
+        <rect x="514.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="514.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_72">
+        <rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="649.22525" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_71">
+        <rect x="649.22525" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="649.22525" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_74">
+        <rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" fill="#ccc"/>
+        <rect x="785.23165" y="517.22266" width="125.49101" height="181.24219" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_73">
+        <rect x="785.23165" y="673.46484" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="785.23165" y="673.46484" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+      </g>
+      <g id="Graphic_78">
+        <rect x="241" y="731.3359" width="125.49101" height="27.26953" fill="#ccc"/>
+        <rect x="241" y="731.3359" width="125.49101" height="27.26953" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(246 735.7467)" fill="black">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="black" x="17.297502" y="15" xml:space="preserve">Delta Layer</tspan>
+        </text>
+      </g>
+      <g id="Graphic_79">
+        <rect x="241" y="766.759" width="125.49101" height="17.5" fill="#6b7ca5"/>
+        <rect x="241" y="766.759" width="125.49101" height="17.5" stroke="black" stroke-linecap="round" stroke-linejoin="round" stroke-width="1"/>
+        <text transform="translate(246 766.285)" fill="white">
+          <tspan font-family="Helvetica Neue" font-size="16" fill="white" x="13.737502" y="15" xml:space="preserve">Image Layer</tspan>
+        </text>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        if var(REAL_S3_ENV).is_ok() {
            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
        }
+
+        #[cfg(target_os = "linux")]
        assert!(body.contains("process_threads"));
    }

--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "neon-shmem"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+nix.workspace=true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "macos")'.dependencies]
+tempfile = "3.14.0"
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -0,0 +1,418 @@
+//! Shared memory utilities for neon communicator
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    /// Build an [`Error`] from a short description of the failed operation and the errno
+    /// that it failed with.
+    fn new(msg: &str, errno: Errno) -> Error {
+        let msg = String::from(msg);
+        Error { msg, errno }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area of 'initial_size' bytes, which can later be resized up to
+    /// 'max_size' bytes with [`ShmemHandle::set_size`].
+    ///
+    /// To communicate between processes, the processes need to be fork()'d after calling this,
+    /// so that the ShmemHandle is inherited by all of them. Dropping the ShmemHandle unmaps the
+    /// memory from the current process only; other processes can continue using it.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // The backing file has to exist before it can be mapped. An error from creating it is
+        // returned as is.
+        create_backing_file(name).and_then(|fd| Self::new_with_fd(fd, initial_size, max_size))
+    }
+
+    /// Map the backing file 'fd' into memory and initialize the header at the start of the area.
+    ///
+    /// 'initial_size' and 'max_size' are the sizes of the user data; the size of the
+    /// 'SharedStruct' header is added on top of both.
+    ///
+    /// Panics if 'max_size' is 2^48 or larger, or if 'initial_size' is larger than 'max_size'.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        // The errno is appended by Error's Display impl, so it is not repeated in the message.
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size. If that fails, no ShmemHandle is constructed and
+        // its Drop impl never runs, so the mapping has to be released here to avoid leaking the
+        // reserved address space.
+        if let Err(e) = enlarge_file(fd.as_fd(), initial_size as u64) {
+            // SAFETY: 'start_ptr' was returned by the mmap() call above for exactly this
+            // length, and nothing else refers to the mapping yet.
+            let _ = unsafe { nix_munmap(start_ptr, max_size.get()) };
+            return Err(e);
+        }
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // Borrow the header that is stored at the beginning of the shared memory area.
+    fn shared(&self) -> &SharedStruct {
+        let header: *const SharedStruct = self.shared_ptr.as_ptr();
+        unsafe { &*header }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area.
+    ///
+    /// Only one resize may be in progress at a time, across all processes and threads that share
+    /// the area. If another resize is found to be in progress, an Error is returned and the size
+    /// is left unchanged.
+    ///
+    /// Panics if 'new_size' is larger than 'max_size'.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        // Total size including the header. Saturate instead of wrapping, so that an absurdly
+        // large request is caught by the max_size check below even in release builds.
+        let new_size = new_size.saturating_add(HEADER_SIZE);
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the RESIZE_IN_PROGRESS bit in 'current_size'. The size bits
+        // keep the old value until the resize has finished, so current_size() keeps reporting
+        // the old size in the meantime.
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                // The errno is appended by Error's Display impl, so it is not repeated here.
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock. Storing a plain size also clears the RESIZE_IN_PROGRESS bit. If the resize
+        // failed, the old size is put back.
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current size of the shared memory segment as seen by the user, i.e. without
+    /// the header.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let raw = self.shared().current_size.load(Ordering::Relaxed);
+        // Mask out the flag bit to get the total size, then take off the header.
+        (raw & !RESIZE_IN_PROGRESS) - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    /// Unmap the area from this process. Errors from munmap() are ignored.
+    fn drop(&mut self) {
+        let start = self.shared_ptr.cast();
+        let len = self.max_size;
+        // SAFETY: 'start' is the pointer that mmap() returned, and 'len' is the size that was
+        // mapped, so this unmaps the entire region.
+        let _ = unsafe { nix_munmap(start, len) };
+        // Nothing to do for the fd: OwnedFd closes it when the field is dropped.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macos, fall back to a regular temporary file. That's good enough
+/// for development and testing, but in production we want the file to stay in memory.
+///
+/// Returns the file descriptor of the new file, or an Error carrying the errno of the failed
+/// call.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    // NB: The messages are plain string literals, not format strings. The errno is appended by
+    // Error's Display impl.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                // An io::Error without an OS error code is mapped to errno 0
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed: {e}",
+                e,
+            )
+        })
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // Simple single-process test of growing and shrinking. Also checks that the part of the
+    // area that was shrunk away and then grown back reads as zeros.
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1)?;
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2)?;
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3)?;
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..size2);
+        assert_range(data_ptr, 0, size2..size1);
+
+        // Try to grow beyond max_size
+        //
+        // NB: set_size() panics in that case instead of returning an error, so the check below
+        // cannot be enabled as is.
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        /// Initialize a barrier for 'num_procs' participants in the memory that 'ptr' points to.
+        ///
+        /// 'ptr' must be valid for writes and suitably aligned for a SimpleBarrier.
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            // Use write() rather than assignment: the memory is not initialized yet, so there
+            // is no old value that should be dropped.
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                })
+            }
+        }
+
+        /// Block until all 'num_procs' participants have called wait(). The barrier can be
+        /// reused: each round of 'num_procs' calls forms one generation.
+        pub fn wait(&self) {
+            // AcqRel / Acquire rather than Relaxed: the barrier is used to order plain memory
+            // writes made before wait() against reads made after it in the other participants,
+            // which requires a happens-before edge.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+
+            let generation = old / self.num_procs;
+
+            // Poll until everyone in our generation has arrived
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -183,6 +183,8 @@ pub struct ConfigToml {
    pub enable_tls_page_service_api: bool,
    pub dev_mode: bool,
    pub timeline_import_config: TimelineImportConfig,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -235,7 +237,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
    ScatteredLsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
@@ -305,6 +307,27 @@ impl From<OtelExporterProtocol> for tracing_utils::Protocol {
 pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
+    pub import_job_checkpoint_threshold: NonZeroUsize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(default)]
+pub struct BasebackupCacheConfig {
+    #[serde(with = "humantime_serde")]
+    pub cleanup_period: Duration,
+    // FIXME: Support max_size_bytes.
+    // pub max_size_bytes: usize,
+    pub max_size_entries: i64,
+}
+
+impl Default for BasebackupCacheConfig {
+    fn default() -> Self {
+        Self {
+            cleanup_period: Duration::from_secs(60),
+            // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
+            max_size_entries: 1000,
+        }
+    }
 }

 pub mod statvfs {
@@ -490,6 +513,14 @@ pub struct TenantConfigToml {
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,
+
+    /// Capacity of relsize snapshot cache (used by replicas).
+    pub relsize_snapshot_cache_capacity: usize,
+
+    /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
+    // FIXME: Remove skip_serializing_if when the feature is stable.
+    #[serde(skip_serializing_if = "std::ops::Not::not")]
+    pub basebackup_cache_enabled: bool,
 }

 pub mod defaults {
@@ -639,23 +670,15 @@ impl Default for ConfigToml {
            tenant_config: TenantConfigToml::default(),
            no_sync: None,
            wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
-            page_service_pipelining: if !cfg!(test) {
-                PageServicePipeliningConfig::Serial
-            } else {
-                // Do not turn this into the default until scattered reads have been
-                // validated and rolled-out fully.
-                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+            page_service_pipelining: PageServicePipeliningConfig::Pipelined(
+                PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
                    execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
-                })
-            },
-            get_vectored_concurrent_io: if !cfg!(test) {
-                GetVectoredConcurrentIo::Sequential
-            } else {
-                GetVectoredConcurrentIo::SidecarTask
-            },
-            enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
+                },
+            ),
+            get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask,
+            enable_read_path_debugging: if cfg!(feature = "testing") {
                Some(true)
            } else {
                None
@@ -669,7 +692,9 @@ impl Default for ConfigToml {
            timeline_import_config: TimelineImportConfig {
                import_job_concurrency: NonZeroUsize::new(128).unwrap(),
                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
+                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
            },
+            basebackup_cache_config: None,
        }
    }
 }
@@ -736,6 +761,7 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
+    pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
 }

 impl Default for TenantConfigToml {
@@ -793,6 +819,8 @@ impl Default for TenantConfigToml {
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
            sampling_ratio: None,
+            relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
+            basebackup_cache_enabled: false,
        }
    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -910,6 +910,11 @@ impl Key {
        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
    }

+    #[inline(always)]
+    pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool {
+        self.is_rel_block_key() && self.field4 == rel
+    }
+
    #[inline(always)]
    pub fn is_rel_dir_key(&self) -> bool {
        self.field1 == 0x00
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -336,14 +336,30 @@ impl TimelineCreateRequest {

 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 pub enum ShardImportStatus {
-    InProgress,
+    InProgress(Option<ShardImportProgress>),
    Done,
    Error(String),
 }
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum ShardImportProgress {
+    V1(ShardImportProgressV1),
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardImportProgressV1 {
+    /// Total number of jobs in the import plan
+    pub jobs: usize,
+    /// Number of jobs completed
+    pub completed: usize,
+    /// Hash of the plan
+    pub import_plan_hash: u64,
+}
+
 impl ShardImportStatus {
    pub fn is_terminal(&self) -> bool {
        match self {
-            ShardImportStatus::InProgress => false,
+            ShardImportStatus::InProgress(_) => false,
            ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
        }
    }
@@ -614,6 +630,10 @@ pub struct TenantConfigPatch {
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub sampling_ratio: FieldPatch<Option<Ratio>>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub basebackup_cache_enabled: FieldPatch<bool>,
 }

 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -743,6 +763,12 @@ pub struct TenantConfig {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_ratio: Option<Option<Ratio>>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub relsize_snapshot_cache_capacity: Option<usize>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub basebackup_cache_enabled: Option<bool>,
 }

 impl TenantConfig {
@@ -788,6 +814,8 @@ impl TenantConfig {
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
+            mut relsize_snapshot_cache_capacity,
+            mut basebackup_cache_enabled,
        } = self;

        patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -889,6 +917,12 @@ impl TenantConfig {
            .gc_compaction_ratio_percent
            .apply(&mut gc_compaction_ratio_percent);
        patch.sampling_ratio.apply(&mut sampling_ratio);
+        patch
+            .relsize_snapshot_cache_capacity
+            .apply(&mut relsize_snapshot_cache_capacity);
+        patch
+            .basebackup_cache_enabled
+            .apply(&mut basebackup_cache_enabled);

        Ok(Self {
            checkpoint_distance,
@@ -928,6 +962,8 @@ impl TenantConfig {
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
+            relsize_snapshot_cache_capacity,
+            basebackup_cache_enabled,
        })
    }

@@ -1036,6 +1072,12 @@ impl TenantConfig {
                .gc_compaction_ratio_percent
                .unwrap_or(global_conf.gc_compaction_ratio_percent),
            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
+            relsize_snapshot_cache_capacity: self
+                .relsize_snapshot_cache_capacity
+                .unwrap_or(global_conf.relsize_snapshot_cache_capacity),
+            basebackup_cache_enabled: self
+                .basebackup_cache_enabled
+                .unwrap_or(global_conf.basebackup_cache_enabled),
        }
    }
 }
@@ -1803,7 +1845,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::sync::LazyLock;

    #[derive(
        Copy,
@@ -1851,15 +1892,7 @@ pub mod virtual_file {

    impl IoMode {
        pub fn preferred() -> Self {
-            // The default behavior when running Rust unit tests without any further
-            // flags is to use the newest behavior (DirectRw).
-            // The CI uses the environment variable to unit tests for all different modes.
-            // NB: the Python regression & perf tests have their own defaults management
-            // that writes pageserver.toml; they do not use this variable.
-            static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
-                utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
-            });
-            ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
+            IoMode::DirectRw
        }
    }

--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
+use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};

 use crate::controller_api::NodeRegisterRequest;
@@ -63,9 +64,17 @@ pub struct ValidateResponseTenant {
    pub valid: bool,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TimelineImportStatusRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub generation: Generation,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct PutTimelineImportStatusRequest {
    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
    pub status: ShardImportStatus,
+    pub generation: Generation,
 }
--- a/libs/pageserver_api/src/value.rs
+++ b/libs/pageserver_api/src/value.rs
@@ -36,6 +36,24 @@ impl Value {
            Value::WalRecord(rec) => rec.will_init(),
        }
    }
+
+    #[inline(always)]
+    pub fn estimated_size(&self) -> usize {
+        match self {
+            Value::Image(image) => image.len(),
+            Value::WalRecord(NeonWalRecord::AuxFile {
+                content: Some(content),
+                ..
+            }) => content.len(),
+            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
+            Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4,
+            Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4,
+            Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => {
+                members.len() * 8
+            }
+            _ => 8192, /* use image size as the estimation */
+        }
+    }
 }

 #[derive(Debug, PartialEq)]
--- a/libs/posthog_client_lite/Cargo.toml
+++ b/libs/posthog_client_lite/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "posthog_client_lite"
+version = "0.1.0"
+edition = "2024"
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+reqwest.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+sha2.workspace = true
+workspace_hack.workspace = true
+thiserror.workspace = true
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -0,0 +1,634 @@
+//! A lite version of the PostHog client that only supports local evaluation of feature flags.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use sha2::Digest;
+
+#[derive(Debug, thiserror::Error)]
+pub enum PostHogEvaluationError {
+    /// The feature flag is not available, for example, because the local evaluation data is not populated yet.
+    #[error("Feature flag not available: {0}")]
+    NotAvailable(String),
+    #[error("No condition group is matched")]
+    NoConditionGroupMatched,
+    /// Real errors, e.g., the rollout percentage does not add up to 100.
+    #[error("Failed to evaluate feature flag: {0}")]
+    Internal(String),
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationResponse {
+    /// Deserialized flag specs; public so that callers outside this crate can hand them to [`FeatureStore::set_flags`].
+    pub flags: Vec<LocalEvaluationFlag>,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlag {
+    key: String,
+    filters: LocalEvaluationFlagFilters,
+    active: bool,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilters {
+    groups: Vec<LocalEvaluationFlagFilterGroup>,
+    multivariate: LocalEvaluationFlagMultivariate,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilterGroup {
+    variant: Option<String>,
+    properties: Option<Vec<LocalEvaluationFlagFilterProperty>>,
+    rollout_percentage: i64,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagFilterProperty {
+    key: String,
+    value: PostHogFlagFilterPropertyValue,
+    operator: String,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(untagged)]
+pub enum PostHogFlagFilterPropertyValue {
+    String(String),
+    Number(f64),
+    Boolean(bool),
+    List(Vec<String>),
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagMultivariate {
+    variants: Vec<LocalEvaluationFlagMultivariateVariant>,
+}
+
+#[derive(Deserialize)]
+pub struct LocalEvaluationFlagMultivariateVariant {
+    key: String,
+    rollout_percentage: i64,
+}
+
+pub struct FeatureStore {
+    flags: HashMap<String, LocalEvaluationFlag>,
+}
+
+impl Default for FeatureStore {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+enum GroupEvaluationResult {
+    MatchedAndOverride(String),
+    MatchedAndEvaluate,
+    Unmatched,
+}
+
+impl FeatureStore {
+    pub fn new() -> Self {
+        Self {
+            flags: HashMap::new(),
+        }
+    }
+
+    pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
+        self.flags.clear();
+        for flag in flags {
+            self.flags.insert(flag.key.clone(), flag);
+        }
+    }
+
+    /// Generate a consistent hash in the range `[0, 1]` for a user ID (e.g., tenant ID) and a flag key.
+    ///
+    /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`.
+    /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a
+    /// tenant. Therefore, we take the first 8 bytes of sha256 of `user_id.flag_key.salt` and divide by `u64::MAX`.
+    fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 {
+        let mut hasher = sha2::Sha256::new();
+        hasher.update(user_id);
+        hasher.update(".");
+        hasher.update(flag_key);
+        hasher.update(".");
+        hasher.update(salt);
+        let hash = hasher.finalize();
+        let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap());
+        hash_int as f64 / u64::MAX as f64
+    }
+
+    /// Evaluate a single condition. Supported operators are `exact`, `lt`, and `gt`; returns an error for any other
+    /// operator, or if an operand has an unexpected type or cannot be parsed as a number.
+    fn evaluate_condition(
+        &self,
+        operator: &str,
+        provided: &PostHogFlagFilterPropertyValue,
+        requested: &PostHogFlagFilterPropertyValue,
+    ) -> Result<bool, PostHogEvaluationError> {
+        match operator {
+            "exact" => {
+                let PostHogFlagFilterPropertyValue::String(provided) = provided else {
+                    // The provided (left) value must be a string
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The left side of the condition is not a string: {:?}",
+                        provided
+                    )));
+                };
+                let PostHogFlagFilterPropertyValue::List(requested) = requested else {
+                    // The requested (right) value must be a list of strings
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The right side of the condition is not a list: {:?}",
+                        requested
+                    )));
+                };
+                Ok(requested.contains(provided))
+            }
+            "lt" | "gt" => {
+                let PostHogFlagFilterPropertyValue::String(requested) = requested else {
+                    // The requested (right) value must be a string that holds a number
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "The right side of the condition is not a string: {:?}",
+                        requested
+                    )));
+                };
+                let Ok(requested) = requested.parse::<f64>() else {
+                    return Err(PostHogEvaluationError::Internal(format!(
+                        "Can not parse the right side of the condition as a number: {:?}",
+                        requested
+                    )));
+                };
+                // The provided (left) value can either be a number or a string that holds a number
+                let provided = match provided {
+                    PostHogFlagFilterPropertyValue::Number(provided) => *provided,
+                    PostHogFlagFilterPropertyValue::String(provided) => {
+                        let Ok(provided) = provided.parse::<f64>() else {
+                            return Err(PostHogEvaluationError::Internal(format!(
+                                "Can not parse the left side of the condition as a number: {:?}",
+                                provided
+                            )));
+                        };
+                        provided
+                    }
+                    _ => {
+                        return Err(PostHogEvaluationError::Internal(format!(
+                            "The left side of the condition is not a number or a string: {:?}",
+                            provided
+                        )));
+                    }
+                };
+                match operator {
+                    "lt" => Ok(provided < requested),
+                    "gt" => Ok(provided > requested),
+                    op => Err(PostHogEvaluationError::Internal(format!(
+                        "Unsupported operator: {}",
+                        op
+                    ))),
+                }
+            }
+            _ => Err(PostHogEvaluationError::Internal(format!(
+                "Unsupported operator: {}",
+                operator
+            ))),
+        }
+    }
+
+    /// Returns true if `mapped_user_id` is within `percentage` percent, i.e. `mapped_user_id <= percentage / 100`.
+    fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool {
+        mapped_user_id <= percentage as f64 / 100.0
+    }
+
+    /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation.
+    ///
+    /// Return values:
+    /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value
+    /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage
+    /// Ok(GroupEvaluationResult::Unmatched): condition unmatched
+    fn evaluate_group(
+        &self,
+        group: &LocalEvaluationFlagFilterGroup,
+        hash_on_group_rollout_percentage: f64,
+        provided_properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<GroupEvaluationResult, PostHogEvaluationError> {
+        if let Some(ref properties) = group.properties {
+            for property in properties {
+                if let Some(value) = provided_properties.get(&property.key) {
+                    // The user provided the property value
+                    if !self.evaluate_condition(
+                        property.operator.as_ref(),
+                        value,
+                        &property.value,
+                    )? {
+                        return Ok(GroupEvaluationResult::Unmatched);
+                    }
+                } else {
+                    // We cannot evaluate, the property is not available
+                    return Err(PostHogEvaluationError::NotAvailable(format!(
+                        "The required property in the condition is not available: {}",
+                        property.key
+                    )));
+                }
+            }
+        }
+
+        // The group has no condition matchers or we matched the properties
+        if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) {
+            if let Some(ref variant_override) = group.variant {
+                Ok(GroupEvaluationResult::MatchedAndOverride(
+                    variant_override.clone(),
+                ))
+            } else {
+                Ok(GroupEvaluationResult::MatchedAndEvaluate)
+            }
+        } else {
+            Ok(GroupEvaluationResult::Unmatched)
+        }
+    }
+
+    /// Evaluate a multivariate feature flag. Returns an error if the flag is not available or cannot be evaluated.
+    /// No properties are supplied here, so evaluating a group that has property conditions fails with `NotAvailable`.
+    ///
+    /// The evaluation logic is as follows:
+    ///
+    /// * Match each filter group in order.
+    ///   - If a group is matched, it will first determine whether the user is in the range of the group's rollout
+    ///     percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash
+    ///     is shared across all groups.
+    ///   - If the hash falls within the group's rollout percentage, return the group's variant override if it has one;
+    ///   - otherwise, evaluate the variant using the global config and the global rollout percentage.
+    /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the
+    ///   rollout percentage.
+    /// * If there are no matching groups, return an error.
+    ///
+    /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%).
+    /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override.
+    /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C.
+    pub fn evaluate_multivariate(
+        &self,
+        flag_key: &str,
+        user_id: &str,
+    ) -> Result<String, PostHogEvaluationError> {
+        let hash_on_global_rollout_percentage =
+            Self::consistent_hash(user_id, flag_key, "multivariate");
+        let hash_on_group_rollout_percentage =
+            Self::consistent_hash(user_id, flag_key, "within_group");
+        self.evaluate_multivariate_inner(
+            flag_key,
+            hash_on_global_rollout_percentage,
+            hash_on_group_rollout_percentage,
+            &HashMap::new(),
+        )
+    }
+
+    /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID
+    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
+    /// and avoid duplicate computations.
+    ///
+    /// Use a different consistent hash for evaluating the group rollout percentage.
+    /// The behavior: if the condition is set to rolling out to 10% of the users, and
+    /// we set the variant A to 20% in the global config, then 2% of the total users will
+    /// be evaluated to variant A.
+    ///
+    /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two
+    /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users
+    /// will be evaluated (versus 30% if group evaluation is done independently).
+    pub(crate) fn evaluate_multivariate_inner(
+        &self,
+        flag_key: &str,
+        hash_on_global_rollout_percentage: f64,
+        hash_on_group_rollout_percentage: f64,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<String, PostHogEvaluationError> {
+        if let Some(flag_config) = self.flags.get(flag_key) {
+            if !flag_config.active {
+                return Err(PostHogEvaluationError::NotAvailable(format!(
+                    "The feature flag is not active: {}",
+                    flag_key
+                )));
+            }
+            // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog
+            // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it
+            // does not matter.
+            for group in &flag_config.filters.groups {
+                match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? {
+                    GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant),
+                    GroupEvaluationResult::MatchedAndEvaluate => {
+                        let mut percentage = 0;
+                        for variant in &flag_config.filters.multivariate.variants {
+                            percentage += variant.rollout_percentage;
+                            if self
+                                .evaluate_percentage(hash_on_global_rollout_percentage, percentage)
+                            {
+                                return Ok(variant.key.clone());
+                            }
+                        }
+                        // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog
+                        // returned invalid spec, we return an error.
+                        return Err(PostHogEvaluationError::Internal(format!(
+                            "Rollout percentage does not add up to 100: {}",
+                            flag_key
+                        )));
+                    }
+                    GroupEvaluationResult::Unmatched => continue,
+                }
+            }
+            // If no group is matched, the feature is not available, and up to the caller to decide what to do.
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        } else {
+            // The feature flag is not available yet
+            Err(PostHogEvaluationError::NotAvailable(format!(
+                "Not found in the local evaluation spec: {}",
+                flag_key
+            )))
+        }
+    }
+}
+
+/// A lite PostHog client.
+///
+/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support.
+/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs
+/// that will be used within Neon.
+///
+/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed
+/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the
+/// server have different API keys and provide different sets of APIs. In Neon, we only have the server (that is
+/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within
+/// our PostHog client.
+///
+/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we
+/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as a UI to
+/// configure feature flags so it is very likely that the client API will not be used.
+pub struct PostHogClient {
+    /// The server API key.
+    server_api_key: String,
+    /// The client API key.
+    client_api_key: String,
+    /// The project ID.
+    project_id: String,
+    /// The private API URL.
+    private_api_url: String,
+    /// The public API URL.
+    public_api_url: String,
+    /// The HTTP client.
+    client: reqwest::Client,
+}
+
+impl PostHogClient {
+    pub fn new(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+        private_api_url: String,
+        public_api_url: String,
+    ) -> Self {
+        let client = reqwest::Client::new();
+        Self {
+            server_api_key,
+            client_api_key,
+            project_id,
+            private_api_url,
+            public_api_url,
+            client,
+        }
+    }
+
+    pub fn new_with_us_region(
+        server_api_key: String,
+        client_api_key: String,
+        project_id: String,
+    ) -> Self {
+        Self::new(
+            server_api_key,
+            client_api_key,
+            project_id,
+            "https://us.posthog.com".to_string(),
+            "https://us.i.posthog.com".to_string(),
+        )
+    }
+
+    /// Fetch the feature flag specs from the server. Returns an error on a non-success HTTP status.
+    ///
+    /// This is unfortunately an undocumented API at:
+    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
+    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
+    ///
+    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
+    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
+    pub async fn get_feature_flags_local_evaluation(
+        &self,
+    ) -> anyhow::Result<LocalEvaluationResponse> {
+        // GET BASE_URL/api/projects/:project_id/feature_flags/local_evaluation (bearer: server key)
+        let url = format!(
+            "{}/api/projects/{}/feature_flags/local_evaluation",
+            self.private_api_url, self.project_id
+        );
+        let response = self
+            .client
+            .get(url)
+            .bearer_auth(&self.server_api_key)
+            .send()
+            .await?
+            .error_for_status()?;
+        let body = response.text().await?;
+        Ok(serde_json::from_str(&body)?)
+    }
+
+    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
+    /// it also supports a lot of other functionalities. Returns an error if the request fails or is rejected.
+    ///
+    /// <https://posthog.com/docs/api/capture>
+    pub async fn capture_event(
+        &self,
+        event: &str,
+        distinct_id: &str,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> anyhow::Result<()> {
+        // POST PUBLIC_URL/capture/ -- the client API key is sent in the JSON body, not as a bearer token.
+        let url = format!("{}/capture/", self.public_api_url);
+        self.client
+            .post(url)
+            .body(serde_json::to_string(&json!({
+                "api_key": self.client_api_key,
+                "distinct_id": distinct_id,
+                "event": event,
+                "properties": properties,
+            }))?)
+            .send()
+            .await?
+            .error_for_status()?;
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn data() -> &'static str {
+        r#"{
+            "flags": [
+                {
+                    "id": 132794,
+                    "team_id": 152860,
+                    "name": "",
+                    "key": "gc-compaction",
+                    "filters": {
+                        "groups": [
+                            {
+                                "variant": "enabled-stage-2",
+                                "properties": [
+                                    {
+                                        "key": "plan_type",
+                                        "type": "person",
+                                        "value": [
+                                            "free"
+                                        ],
+                                        "operator": "exact"
+                                    },
+                                    {
+                                        "key": "pageserver_remote_size",
+                                        "type": "person",
+                                        "value": "10000000",
+                                        "operator": "lt"
+                                    }
+                                ],
+                                "rollout_percentage": 50
+                            },
+                            {
+                                "properties": [
+                                    {
+                                        "key": "plan_type",
+                                        "type": "person",
+                                        "value": [
+                                            "free"
+                                        ],
+                                        "operator": "exact"
+                                    },
+                                    {
+                                        "key": "pageserver_remote_size",
+                                        "type": "person",
+                                        "value": "10000000",
+                                        "operator": "lt"
+                                    }
+                                ],
+                                "rollout_percentage": 80
+                            }
+                        ],
+                        "payloads": {},
+                        "multivariate": {
+                            "variants": [
+                                {
+                                    "key": "disabled",
+                                    "name": "",
+                                    "rollout_percentage": 90
+                                },
+                                {
+                                    "key": "enabled-stage-1",
+                                    "name": "",
+                                    "rollout_percentage": 10
+                                },
+                                {
+                                    "key": "enabled-stage-2",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                },
+                                {
+                                    "key": "enabled-stage-3",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                },
+                                {
+                                    "key": "enabled",
+                                    "name": "",
+                                    "rollout_percentage": 0
+                                }
+                            ]
+                        }
+                    },
+                    "deleted": false,
+                    "active": true,
+                    "ensure_experience_continuity": false,
+                    "has_encrypted_payloads": false,
+                    "version": 6
+                }
+            ],
+            "group_type_mapping": {},
+            "cohorts": {}
+        }"#
+    }
+
+    #[test]
+    fn parse_local_evaluation() {
+        let data = data();
+        let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap();
+    }
+
+    #[test]
+    fn evaluate_multivariate() {
+        let mut store = FeatureStore::new();
+        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
+        store.set_flags(response.flags);
+
+        // This lacks the required properties and cannot be evaluated.
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new());
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NotAvailable(_))
+        ),);
+
+        let properties_unmatched = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("paid".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // This does not match any group so there will be an error.
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+        let variant =
+            store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+
+        let properties = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("free".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties);
+        assert_eq!(variant.unwrap(), "enabled-stage-2".to_string());
+
+        // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties);
+        assert_eq!(variant.unwrap(), "enabled-stage-1".to_string());
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties);
+        assert_eq!(variant.unwrap(), "disabled".to_string());
+
+        // It matches the group conditions but not the group rollout percentage.
+        let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+    }
+}
--- a/libs/proxy/tokio-postgres2/src/error/mod.rs
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -86,6 +86,27 @@ pub struct DbError {
 }

 impl DbError {
+    pub fn new_test_error(code: SqlState, message: String) -> Self {
+        DbError {
+            severity: "ERROR".to_string(),
+            parsed_severity: Some(Severity::Error),
+            code,
+            message,
+            detail: None,
+            hint: None,
+            position: None,
+            where_: None,
+            schema: None,
+            table: None,
+            column: None,
+            datatype: None,
+            constraint: None,
+            file: None,
+            line: None,
+            routine: None,
+        }
+    }
+
    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
        let mut severity = None;
        let mut parsed_severity = None;
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -330,11 +330,18 @@ impl AzureBlobStorage {
                if let Err(DownloadError::Timeout) = &next_item {
                    timeout_try_cnt += 1;
                    if timeout_try_cnt <= 5 {
-                        continue;
+                        continue 'outer;
                    }
                }

-                let next_item = next_item?;
+                let next_item = match next_item {
+                    Ok(next_item) => next_item,
+                    Err(e) => {
+                        // The error is potentially retryable, so we must rewind the loop after yielding.
+                        yield Err(e);
+                        continue 'outer;
+                    },
+                };

                // Log a warning if we saw two timeouts in a row before a successful request
                if timeout_try_cnt > 2 {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket {
                    res = request => Ok(res),
                    _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
                    _ = cancel.cancelled() => Err(DownloadError::Cancelled),
-                }?;
+                };
+
+                if let Err(DownloadError::Timeout) = &response {
+                    yield Err(DownloadError::Timeout);
+                    continue 'outer;
+                }
+
+                let response = response?; // always yield cancellation errors and stop the stream

                let response = response
                    .context("Failed to list S3 prefixes")
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::fs::{self, File};
 use std::io::{self, Write};
-use std::os::fd::AsRawFd;
+use std::os::fd::AsFd;

 use camino::{Utf8Path, Utf8PathBuf};

@@ -210,13 +210,13 @@ pub fn overwrite(

 /// Syncs the filesystem for the given file descriptor.
 #[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> {
    // Linux guarantees durability for syncfs.
    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
    #[cfg(target_os = "linux")]
    {
        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+        nix::unistd::syncfs(fd).context("syncfs")?;
    }
    #[cfg(target_os = "macos")]
    {
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -11,9 +11,9 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
        #[cfg(all(target_os = "linux", target_env = "gnu"))]
        {
            nix::fcntl::renameat2(
-                None,
+                nix::fcntl::AT_FDCWD,
                src,
-                None,
+                nix::fcntl::AT_FDCWD,
                dst,
                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
            )
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,6 +1,6 @@
 //! A module to create and read lock files.
 //!
-//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
 //! The only consumer of this module is currently
 //! [`pid_file`](crate::pid_file). See the module-level comment
 //! there for potential pitfalls with lock files that are used
@@ -9,26 +9,25 @@
 use std::fs;
 use std::io::{Read, Write};
 use std::ops::Deref;
-use std::os::unix::prelude::AsRawFd;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno::EAGAIN;
-use nix::fcntl;
+use nix::fcntl::{Flock, FlockArg};

 use crate::crashsafe;

-/// A handle to an open and unlocked, but not-yet-written lock file.
+/// A handle to an open and flocked, but not-yet-written lock file.
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
    path: Utf8PathBuf,
-    file: fs::File,
+    file: Flock<fs::File>,
 }

 /// Returned by [`UnwrittenLockFile::write_content`].
 #[must_use]
-pub struct LockFileGuard(fs::File);
+pub struct LockFileGuard(Flock<fs::File>);

 impl Deref for LockFileGuard {
    type Target = fs::File;
@@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLo
        .open(lock_file_path)
        .context("open lock file")?;

-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    match res {
-        Ok(()) => Ok(UnwrittenLockFile {
+        Ok(lock_file) => Ok(UnwrittenLockFile {
            path: lock_file_path.to_owned(),
            file: lock_file,
        }),
-        Err(EAGAIN) => anyhow::bail!("file is already locked"),
-        Err(e) => Err(e).context("flock error"),
+        Err((_, EAGAIN)) => anyhow::bail!("file is already locked"),
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }

@@ -105,32 +101,37 @@ pub enum LockFileRead {
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
-    let mut lock_file = match res {
+    let lock_file = match res {
        Ok(f) => f,
        Err(e) => match e.kind() {
            std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
            _ => return Err(e).context("open lock file"),
        },
    };
-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    // We need the content regardless of lock success / failure.
    // But, read it after flock so that, if it succeeded, the content is consistent.
-    let mut content = String::new();
-    lock_file
-        .read_to_string(&mut content)
-        .context("read lock file")?;
    match res {
-        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
-            LockFileGuard(lock_file),
-            content,
-        )),
-        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
-            not_locked_file: lock_file,
-            content,
-        }),
-        Err(e) => Err(e).context("flock error"),
+        Ok(mut locked_file) => {
+            let mut content = String::new();
+            locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::NotHeldByAnyProcess(
+                LockFileGuard(locked_file),
+                content,
+            ))
+        }
+        Err((mut not_locked_file, EAGAIN)) => {
+            let mut content = String::new();
+            not_locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::LockedByOtherProcess {
+                not_locked_file,
+                content,
+            })
+        }
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -127,12 +127,12 @@ macro_rules! __check_fields_present {

            match check_fields_present0($extractors) {
                Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
+                Ok(Unconfigured) if cfg!(feature = "testing") => {
                    // allow unconfigured in tests
                    Ok(())
                },
                Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
+                    panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#)
                },
                Err(missing) => Err(missing)
            }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -30,6 +30,7 @@ crc32c.workspace = true
 either.workspace = true
 fail.workspace = true
 futures.workspace = true
+hashlink.workspace = true
 hex.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
@@ -96,6 +97,7 @@ strum.workspace = true
 strum_macros.workspace = true
 wal_decoder.workspace = true
 smallvec.workspace = true
+twox-hash.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::error::Error as _;
+use std::time::Duration;

 use bytes::Bytes;
 use detach_ancestor::AncestorDetached;
@@ -819,4 +820,25 @@ impl Client {
            .await
            .map(|resp| resp.status())
    }
+
+    pub async fn activate_post_import(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        activate_timeline_timeout: Duration,
+    ) -> Result<TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}",
+            self.mgmt_api_endpoint,
+            tenant_shard_id,
+            timeline_id,
+            activate_timeline_timeout.as_millis()
+        );
+
+        self.request(Method::PUT, uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,13 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Generates Rust code from .proto Protobuf schemas, along with a binary file
+/// descriptor set for Protobuf schema reflection.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
+    tonic_build::configure()
+        .bytes(["."])
+        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
+        .compile_protos(&["proto/page_service.proto"], &["proto"])
+        .map_err(|err| err.into())
+}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,233 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// The service can be accessed via e.g. grpcurl:
+//
+//    ```
+//    grpcurl \
+//      -plaintext \
+//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
+//      -H "neon-shard-id: 0b10" \
+//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
+//      -H "authorization: Bearer $JWT" \
+//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
+//      localhost:51051 page_api.PageService/CheckRelExists
+//    ```
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_api;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+    uint32 spc_oid = 1;
+    uint32 db_oid = 2;
+    uint32 rel_number = 3;
+    uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currently rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up to 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,19 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * the `proto` module: Protobuf types for gRPC, auto-generated from that schema.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated by protobuf.
+pub mod proto {
+    tonic::include_proto!("page_api");
+
+    /// File descriptor set for Protobuf schema reflection. This allows using
+    /// e.g. grpcurl with the API.
+    pub const FILE_DESCRIPTOR_SET: &[u8] =
+        tonic::include_file_descriptor_set!("page_api_descriptor");
+
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -65,6 +65,9 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -206,7 +209,12 @@ async fn main_impl(
                    for r in partitioning.keys.ranges.iter() {
                        let mut i = r.start;
                        while i != r.end {
-                            if i.is_rel_block_key() {
+                            let mut include = true;
+                            include &= i.is_rel_block_key();
+                            if let Some(only_relnode) = args.only_relnode {
+                                include &= i.is_rel_block_of_rel(only_relnode);
+                            }
+                            if include {
                                filtered.add_key(i);
                            }
                            i = i.next();
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -144,7 +144,7 @@ where
        replica,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf,
+            timeline.conf.get_vectored_concurrent_io,
            timeline
                .gate
                .enter()
@@ -343,7 +343,7 @@ where
            // Gather non-relational files from object storage pages.
            let slru_partitions = self
                .timeline
-                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+                .get_slru_keyspace(Version::at(self.lsn), self.ctx)
                .await?
                .partition(
                    self.timeline.get_shard_identity(),
@@ -378,7 +378,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -517,7 +517,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
+            .get_rel_size(src, Version::at(self.lsn), self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -577,7 +577,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                .await?;

            if img.len()
@@ -631,7 +631,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/basebackup_cache.rs
+++ b/pageserver/src/basebackup_cache.rs
@@ -0,0 +1,518 @@
+use std::{collections::HashMap, sync::Arc};
+
+use async_compression::tokio::write::GzipEncoder;
+use camino::{Utf8Path, Utf8PathBuf};
+use metrics::core::{AtomicU64, GenericCounter};
+use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
+use tokio::{
+    io::{AsyncWriteExt, BufWriter},
+    sync::mpsc::{UnboundedReceiver, UnboundedSender},
+};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+    shard::TenantShardId,
+};
+
+use crate::{
+    basebackup::send_basebackup_tarball,
+    context::{DownloadBehavior, RequestContext},
+    metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
+    task_mgr::TaskKind,
+    tenant::{
+        Timeline,
+        mgr::{TenantManager, TenantSlot},
+    },
+};
+
+pub struct BasebackupPrepareRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub lsn: Lsn,
+}
+
+pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
+pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
+
+type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
+type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
+
+/// BasebackupCache stores cached basebackup archives for timelines on local disk.
+///
+/// The main purpose of this cache is to speed up the startup process of compute nodes
+/// after scaling to zero.
+/// Thus, the basebackup is stored only for the latest LSN of the timeline and with a
+/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none).
+///
+/// The cache receives prepare requests through the `BasebackupPrepareSender` channel,
+/// generates a basebackup from the timeline in the background, and stores it on disk.
+///
+/// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache
+/// and ~1 RPS for get requests.
+pub struct BasebackupCache {
+    data_dir: Utf8PathBuf,
+    config: BasebackupCacheConfig,
+    tenant_manager: Arc<TenantManager>,
+    remove_entry_sender: BasebackupRemoveEntrySender,
+
+    entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
+
+    cancel: CancellationToken,
+
+    read_hit_count: GenericCounter<AtomicU64>,
+    read_miss_count: GenericCounter<AtomicU64>,
+    read_err_count: GenericCounter<AtomicU64>,
+
+    prepare_ok_count: GenericCounter<AtomicU64>,
+    prepare_skip_count: GenericCounter<AtomicU64>,
+    prepare_err_count: GenericCounter<AtomicU64>,
+}
+
+impl BasebackupCache {
+    /// Creates a BasebackupCache and spawns the background task.
+    /// The initialization of the cache is performed in the background and does not
+    /// block the caller. The cache will return `None` for any get requests until
+    /// initialization is complete.
+    pub fn spawn(
+        runtime_handle: &tokio::runtime::Handle,
+        data_dir: Utf8PathBuf,
+        config: Option<BasebackupCacheConfig>,
+        prepare_receiver: BasebackupPrepareReceiver,
+        tenant_manager: Arc<TenantManager>,
+        cancel: CancellationToken,
+    ) -> Arc<Self> {
+        let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
+
+        let enabled = config.is_some();
+
+        let cache = Arc::new(BasebackupCache {
+            data_dir,
+            config: config.unwrap_or_default(),
+            tenant_manager,
+            remove_entry_sender,
+
+            entries: std::sync::Mutex::new(HashMap::new()),
+
+            cancel,
+
+            read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
+            read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
+            read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
+
+            prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
+            prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
+            prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
+        });
+
+        if enabled {
+            runtime_handle.spawn(
+                cache
+                    .clone()
+                    .background(prepare_receiver, remove_entry_receiver),
+            );
+        }
+
+        cache
+    }
+
+    /// Gets a basebackup entry from the cache.
+    /// If the entry is found, opens a file with the basebackup archive and returns it.
+    /// The open file descriptor will prevent the file system from deleting the file
+    /// even if the entry is removed from the cache in the background.
+    pub async fn get(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Option<tokio::fs::File> {
+        // Fast path. Check if the entry exists using the in-memory state.
+        let tti = TenantTimelineId::new(tenant_id, timeline_id);
+        if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
+            self.read_miss_count.inc();
+            return None;
+        }
+
+        let path = self.entry_path(tenant_id, timeline_id, lsn);
+
+        match tokio::fs::File::open(path).await {
+            Ok(file) => {
+                self.read_hit_count.inc();
+                Some(file)
+            }
+            Err(e) => {
+                if e.kind() == std::io::ErrorKind::NotFound {
+                    // We may end up here if the basebackup was concurrently removed by the cleanup task.
+                    self.read_miss_count.inc();
+                } else {
+                    self.read_err_count.inc();
+                    tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e);
+                }
+                None
+            }
+        }
+    }
+
+    // Private methods.
+
+    fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
+        // The default format for LSN is 0/ABCDEF.
+        // The slash is not filename friendly, so serialize it as plain hex.
+        let lsn = lsn.0;
+        format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz")
+    }
+
+    fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf {
+        self.data_dir
+            .join(Self::entry_filename(tenant_id, timeline_id, lsn))
+    }
+
+    fn entry_tmp_path(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Utf8PathBuf {
+        self.data_dir
+            .join("tmp")
+            .join(Self::entry_filename(tenant_id, timeline_id, lsn))
+    }
+
+    fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
+        let parts: Vec<&str> = filename
+            .strip_prefix("basebackup_")?
+            .strip_suffix(".tar.gz")?
+            .split('_')
+            .collect();
+        if parts.len() != 3 {
+            return None;
+        }
+        let tenant_id = parts[0].parse::<TenantId>().ok()?;
+        let timeline_id = parts[1].parse::<TimelineId>().ok()?;
+        let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?);
+
+        Some((tenant_id, timeline_id, lsn))
+    }
+
+    /// Periodic maintenance: removes every file found in the `tmp` directory and
+    /// drops cache entries that are no longer usable.
+    ///
+    /// An entry is kept only if its timeline is listed by an attached shard-zero
+    /// tenant and the timeline's last_record_lsn has not advanced past the entry's
+    /// LSN. Files of dropped entries are queued on `remove_entry_sender`.
+    async fn cleanup(&self) -> anyhow::Result<()> {
+        // Cleanup tmp directory.
+        let mut tmp_files = tokio::fs::read_dir(&self.data_dir.join("tmp")).await?;
+        while let Some(tmp_file) = tmp_files.next_entry().await? {
+            let removal = tokio::fs::remove_file(tmp_file.path()).await;
+            if let Err(e) = removal {
+                tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
+            }
+        }
+
+        // Remove outdated entries.
+        let previous = self.entries.lock().unwrap().clone();
+        let mut retained = HashMap::new();
+        for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
+            if !tenant_shard_id.is_shard_zero() {
+                continue;
+            }
+            let tenant = match tenant_slot {
+                TenantSlot::Attached(tenant) => tenant,
+                _ => continue,
+            };
+
+            for timeline in tenant.list_timelines() {
+                let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline.timeline_id);
+                match previous.get(&tti) {
+                    Some(&entry_lsn) if timeline.get_last_record_lsn() <= entry_lsn => {
+                        retained.insert(tti, entry_lsn);
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        // Everything that was known before but did not survive gets its file removed.
+        let dropped = previous
+            .iter()
+            .filter(|(tti, _)| !retained.contains_key(*tti));
+        for (tti, &lsn) in dropped {
+            self.remove_entry_sender
+                .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
+                .unwrap();
+        }
+
+        BASEBACKUP_CACHE_ENTRIES.set(retained.len() as i64);
+        *self.entries.lock().unwrap() = retained;
+
+        Ok(())
+    }
+
+    /// Initializes the cache from disk: ensures `data_dir` and `data_dir/tmp` exist,
+    /// then rebuilds the in-memory entry map from the file names found in `data_dir`.
+    ///
+    /// Only the highest-LSN file per timeline is kept; files with a lower LSN are
+    /// queued for removal. Unparsable file names are logged and skipped. Two files
+    /// that parse to the same timeline and LSN are reported as an error.
+    async fn on_startup(&self) -> anyhow::Result<()> {
+        // Create data_dir and tmp directory if they do not exist.
+        if let Err(e) = tokio::fs::create_dir_all(&self.data_dir.join("tmp")).await {
+            return Err(anyhow::anyhow!(
+                "Failed to create basebackup cache data_dir {:?}: {:?}",
+                self.data_dir,
+                e
+            ));
+        }
+
+        // Read existing entries from the data_dir and add them to in-memory state.
+        let mut entries: HashMap<TenantTimelineId, Lsn> = HashMap::new();
+        let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
+        while let Some(dir_entry) = dir.next_entry().await? {
+            let filename = dir_entry.file_name();
+
+            // Skip the tmp directory.
+            if filename == "tmp" {
+                continue;
+            }
+
+            let Some((tenant_id, timeline_id, lsn)) =
+                Self::parse_entry_filename(filename.to_string_lossy().as_ref())
+            else {
+                tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
+                continue;
+            };
+
+            let tti = TenantTimelineId::new(tenant_id, timeline_id);
+
+            // Leave only the latest entry per timeline: `kept` is the LSN that stays
+            // in the map, `outdated` the LSN whose file has to be removed (if any).
+            let (kept, outdated) = match entries.get(&tti).copied() {
+                None => (lsn, None),
+                Some(known_lsn) => match lsn.cmp(&known_lsn) {
+                    std::cmp::Ordering::Less => (known_lsn, Some(lsn)),
+                    std::cmp::Ordering::Greater => (lsn, Some(known_lsn)),
+                    std::cmp::Ordering::Equal => {
+                        // Two different filenames parsed to the same timeline_id and LSN.
+                        // Should never happen.
+                        return Err(anyhow::anyhow!(
+                            "Duplicate basebackup cache entry with the same LSN: {:?}",
+                            filename
+                        ));
+                    }
+                },
+            };
+            if let Some(outdated_lsn) = outdated {
+                self.remove_entry_sender
+                    .send(self.entry_path(tenant_id, timeline_id, outdated_lsn))?;
+            }
+            entries.insert(tti, kept);
+        }
+
+        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
+        *self.entries.lock().unwrap() = entries;
+
+        Ok(())
+    }
+
+    /// Main loop of the cache's background task.
+    ///
+    /// Runs `on_startup` once, then serves prepare requests, queued file removals
+    /// and periodic `cleanup` ticks until `cancel` fires.
+    async fn background(
+        self: Arc<Self>,
+        mut prepare_receiver: BasebackupPrepareReceiver,
+        mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
+    ) {
+        // Panic in the background is a safe fallback.
+        // It will drop receivers and the cache will be effectively disabled.
+        self.on_startup()
+            .await
+            .expect("Failed to initialize basebackup cache");
+
+        let mut cleanup_ticker = {
+            let mut ticker = tokio::time::interval(self.config.cleanup_period);
+            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+            ticker
+        };
+
+        loop {
+            tokio::select! {
+                Some(req) = prepare_receiver.recv() => {
+                    let prepared = self
+                        .prepare_basebackup(req.tenant_shard_id, req.timeline_id, req.lsn)
+                        .await;
+                    // A failed preparation is only counted and logged; the loop goes on.
+                    if let Err(err) = prepared {
+                        tracing::info!("Failed to prepare basebackup: {:#}", err);
+                        self.prepare_err_count.inc();
+                    }
+                }
+                Some(path) = remove_entry_receiver.recv() => {
+                    if let Err(e) = tokio::fs::remove_file(path).await {
+                        tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
+                    }
+                }
+                _ = cleanup_ticker.tick() => {
+                    if let Err(e) = self.cleanup().await {
+                        tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
+                    }
+                }
+                _ = self.cancel.cancelled() => {
+                    tracing::info!("BasebackupCache background task cancelled");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Prepare a basebackup for the given timeline.
+    ///
+    /// The preparation is skipped (returning `Ok`) if:
+    /// - the cache already holds an entry for the timeline at the requested or a
+    ///   higher LSN,
+    /// - the cache is full and the timeline has no entry that would be replaced, or
+    /// - the timeline already has a higher last_record_lsn.
+    ///
+    /// An error is returned if the tenant shard is not attached or not active, if
+    /// the timeline cannot be obtained, or if writing or renaming the file fails.
+    ///
+    /// The basebackup is prepared in a temporary directory and then moved to the final
+    /// location to make the operation atomic.
+    async fn prepare_basebackup(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        req_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        tracing::info!(
+            tenant_id = %tenant_shard_id.tenant_id,
+            %timeline_id,
+            %req_lsn,
+            "Preparing basebackup for timeline",
+        );
+
+        let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
+
+        {
+            let entries = self.entries.lock().unwrap();
+            let existing_lsn = entries.get(&tti).copied();
+            if let Some(entry_lsn) = existing_lsn {
+                if entry_lsn >= req_lsn {
+                    tracing::info!(
+                        %timeline_id,
+                        %req_lsn,
+                        %entry_lsn,
+                        "Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
+                    );
+                    self.prepare_skip_count.inc();
+                    return Ok(());
+                }
+            }
+
+            // Replacing this timeline's existing (older) entry does not grow the cache,
+            // so the size limit only applies when a new entry would be added.
+            if existing_lsn.is_none() && entries.len() as i64 >= self.config.max_size_entries {
+                tracing::info!(
+                    %timeline_id,
+                    %req_lsn,
+                    "Basebackup cache is full, skipping basebackup",
+                );
+                self.prepare_skip_count.inc();
+                return Ok(());
+            }
+        }
+
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        let tenant_state = tenant.current_state();
+        if tenant_state != TenantState::Active {
+            anyhow::bail!(
+                "Tenant {} is not active, current state: {:?}",
+                tenant_shard_id.tenant_id,
+                tenant_state
+            )
+        }
+
+        let timeline = tenant.get_timeline(timeline_id, true)?;
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn > req_lsn {
+            tracing::info!(
+                %timeline_id,
+                %req_lsn,
+                %last_record_lsn,
+                "Timeline has a higher LSN than the requested one, skipping basebackup",
+            );
+            self.prepare_skip_count.inc();
+            return Ok(());
+        }
+
+        let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
+
+        let res = self
+            .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
+            .await;
+
+        if let Err(err) = res {
+            tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
+            // Try to clean up tmp file. If we fail, the background clean up task will take care of it.
+            match tokio::fs::remove_file(&entry_tmp_path).await {
+                Ok(_) => {}
+                // Nothing to remove: the tmp file does not exist.
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+                Err(e) => {
+                    tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
+                }
+            }
+            return Err(err);
+        }
+
+        // Move the tmp file to the final location atomically.
+        let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
+        tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
+
+        let mut entries = self.entries.lock().unwrap();
+        if let Some(old_lsn) = entries.insert(tti, req_lsn) {
+            // Remove the old entry if it exists.
+            self.remove_entry_sender
+                .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
+                .unwrap();
+        }
+        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
+
+        self.prepare_ok_count.inc();
+        Ok(())
+    }
+
+    /// Prepares a basebackup in a temporary file.
+    ///
+    /// Waits until `req_lsn` is available on the timeline, then writes the
+    /// gzip-compressed basebackup tarball to `entry_tmp_path` and syncs the file.
+    /// On error a partially written file may remain at `entry_tmp_path`; this
+    /// function does not remove it.
+    async fn prepare_basebackup_tmp(
+        &self,
+        entry_tmp_path: &Utf8Path,
+        timeline: &Arc<Timeline>,
+        req_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
+        let ctx = ctx.with_scope_timeline(timeline);
+
+        // We may receive a request before the WAL record is applied to the timeline.
+        // Wait for the requested LSN to be applied. This happens before the tmp file
+        // is created, so a failed or timed-out wait neither leaves an empty file
+        // behind nor keeps a file descriptor open while waiting.
+        timeline
+            .wait_lsn(
+                req_lsn,
+                crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache,
+                crate::tenant::timeline::WaitLsnTimeout::Default,
+                &ctx,
+            )
+            .await?;
+
+        let file = tokio::fs::File::create(entry_tmp_path).await?;
+        let mut writer = BufWriter::new(file);
+
+        let mut encoder = GzipEncoder::with_quality(
+            &mut writer,
+            // Level::Best because compression is not on the hot path of basebackup requests.
+            // The decompression is almost not affected by the compression level.
+            async_compression::Level::Best,
+        );
+
+        send_basebackup_tarball(
+            &mut encoder,
+            timeline,
+            Some(req_lsn),
+            None,
+            false,
+            false,
+            &ctx,
+        )
+        .await?;
+
+        // Finish the gzip stream, drain the buffer, then sync the file itself.
+        encoder.shutdown().await?;
+        writer.flush().await?;
+        writer.into_inner().sync_all().await?;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
 use nix::sys::socket::{setsockopt, sockopt};
+use pageserver::basebackup_cache::BasebackupCache;
 use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
@@ -541,6 +542,8 @@ fn start_pageserver(
        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

    // Scan the local 'tenants/' directory and start loading the tenants
+    let (basebackup_prepare_sender, basebackup_prepare_receiver) =
+        tokio::sync::mpsc::unbounded_channel();
    let deletion_queue_client = deletion_queue.new_client();
    let background_purges = mgr::BackgroundPurges::default();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -551,12 +554,22 @@ fn start_pageserver(
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        },
        order,
        shutdown_pageserver.clone(),
    ))?;
    let tenant_manager = Arc::new(tenant_manager);

+    let basebackup_cache = BasebackupCache::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        conf.basebackup_cache_dir(),
+        conf.basebackup_cache_config.clone(),
+        basebackup_prepare_receiver,
+        Arc::clone(&tenant_manager),
+        shutdown_pageserver.child_token(),
+    );
+
    BACKGROUND_RUNTIME.spawn({
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
@@ -763,6 +776,7 @@ fn start_pageserver(
        } else {
            None
        },
+        basebackup_cache,
    );

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -232,6 +232,8 @@ pub struct PageServerConf {
    pub dev_mode: bool,

    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
+
+    pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
 }

 /// Token for authentication to safekeepers
@@ -261,6 +263,10 @@ impl PageServerConf {
        self.workdir.join("metadata.json")
    }

+    /// Path of the `basebackup_cache` directory inside the pageserver workdir.
+    pub fn basebackup_cache_dir(&self) -> Utf8PathBuf {
+        const DIR_NAME: &str = "basebackup_cache";
+        self.workdir.join(DIR_NAME)
+    }
+
    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
@@ -407,6 +413,7 @@ impl PageServerConf {
            enable_tls_page_service_api,
            dev_mode,
            timeline_import_config,
+            basebackup_cache_config,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -461,6 +468,7 @@ impl PageServerConf {
            enable_tls_page_service_api,
            dev_mode,
            timeline_import_config,
+            basebackup_cache_config,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -544,6 +552,23 @@ impl PageServerConf {
                    ratio.numerator, ratio.denominator
                )
            );
+
+            let url = Url::parse(&tracing_config.export_config.endpoint)
+                .map_err(anyhow::Error::msg)
+                .with_context(|| {
+                    format!(
+                        "tracing endpoint URL is invalid : {}",
+                        tracing_config.export_config.endpoint
+                    )
+                })?;
+
+            ensure!(
+                url.scheme() == "http" || url.scheme() == "https",
+                format!(
+                    "tracing endpoint URL must start with http:// or https://: {}",
+                    tracing_config.export_config.endpoint
+                )
+            );
        }

        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
@@ -660,4 +685,25 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }
+
+    /// A tracing endpoint without an `http://` or `https://` scheme must be rejected.
+    #[test]
+    fn test_config_tracing_endpoint_is_invalid() {
+        // The sampling ratio is deliberately valid here: it is validated before the
+        // endpoint, so an invalid ratio (e.g. a zero denominator) could make this
+        // test pass without the endpoint check ever being reached.
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+
+            [tracing]
+
+            sampling_ratio = { numerator = 1, denominator = 10 }
+
+            [tracing.export_config]
+            endpoint = "localhost:4317"
+            protocol = "http-binary"
+            timeout = "1ms"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("config has valid fields");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        let err = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect_err("parse_and_validate should fail for endpoint without scheme");
+
+        // Make sure the failure really comes from the endpoint validation.
+        let msg = format!("{err:#}");
+        assert!(
+            msg.contains("tracing endpoint URL"),
+            "unexpected validation error: {msg}"
+        );
+    }
 }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -18,12 +18,25 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 // management.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(super) enum Name {
-    /// Timeline last_record_lsn, absolute
+    /// Timeline last_record_lsn, absolute.
    #[serde(rename = "written_size")]
    WrittenSize,
    /// Timeline last_record_lsn, incremental
    #[serde(rename = "written_data_bytes_delta")]
    WrittenSizeDelta,
+    /// Written bytes only on this timeline (not including ancestors):
+    /// written_size - ancestor_lsn
+    ///
+    /// On the root branch, this is equivalent to `written_size`.
+    #[serde(rename = "written_size_since_parent")]
+    WrittenSizeSinceParent,
+    /// PITR history size only on this timeline (not including ancestors):
+    /// last_record_lsn - max(pitr_cutoff, ancestor_lsn).
+    ///
+    /// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed
+    /// the PITR cutoff yet. 0 if PITR is disabled.
+    #[serde(rename = "pitr_history_size_since_parent")]
+    PitrHistorySizeSinceParent,
    /// Timeline logical size
    #[serde(rename = "timeline_logical_size")]
    LogicalSize,
@@ -157,6 +170,32 @@ impl MetricsKey {
        .incremental_values()
    }

+    /// Absolute-value key factory for the `written_size_since_parent` metric of a
+    /// timeline: `written_size` - `ancestor_lsn`.
+    const fn written_size_since_parent(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        let key = MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSizeSinceParent,
+        };
+        key.absolute_values()
+    }
+
+    /// Absolute-value key factory for the `pitr_history_size_since_parent` metric of
+    /// a timeline: `written_size` - max(`pitr_cutoff`, `ancestor_lsn`).
+    const fn pitr_history_size_since_parent(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        let key = MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::PitrHistorySizeSinceParent,
+        };
+        key.absolute_values()
+    }
+
    /// Exact [`Timeline::get_current_logical_size`].
    ///
    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
@@ -334,7 +373,13 @@ impl TenantSnapshot {
 struct TimelineSnapshot {
    loaded_at: (Lsn, SystemTime),
    last_record_lsn: Lsn,
+    ancestor_lsn: Lsn,
    current_exact_logical_size: Option<u64>,
+    /// Whether PITR is enabled (pitr_interval > 0).
+    pitr_enabled: bool,
+    /// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately
+    /// Some(last_record_lsn), but may lag behind it since it's computed periodically.
+    pitr_cutoff: Option<Lsn>,
 }

 impl TimelineSnapshot {
@@ -354,6 +399,9 @@ impl TimelineSnapshot {
        } else {
            let loaded_at = t.loaded_at;
            let last_record_lsn = t.get_last_record_lsn();
+            let ancestor_lsn = t.get_ancestor_lsn();
+            let pitr_enabled = !t.get_pitr_interval().is_zero();
+            let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time;

            let current_exact_logical_size = {
                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
@@ -373,7 +421,10 @@ impl TimelineSnapshot {
            Ok(Some(TimelineSnapshot {
                loaded_at,
                last_record_lsn,
+                ancestor_lsn,
                current_exact_logical_size,
+                pitr_enabled,
+                pitr_cutoff,
            }))
        }
    }
@@ -424,6 +475,8 @@ impl TimelineSnapshot {

        let up_to = now;

+        let written_size_last = written_size_now.value.max(prev.1); // don't regress
+
        if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
            let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
            // written_size_delta
@@ -441,6 +494,27 @@ impl TimelineSnapshot {
            });
        }

+        // Compute the branch-local written size.
+        let written_size_since_parent_key =
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id);
+        metrics.push(
+            written_size_since_parent_key
+                .at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)),
+        );
+
+        // Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the
+        // PITR cutoff. 0 if PITR is disabled.
+        let pitr_history_size_since_parent_key =
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id);
+        if !self.pitr_enabled {
+            metrics.push(pitr_history_size_since_parent_key.at(now, 0));
+        } else if let Some(pitr_cutoff) = self.pitr_cutoff {
+            metrics.push(pitr_history_size_since_parent_key.at(
+                now,
+                written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0),
+            ));
+        }
+
        {
            let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
            let current_or_previous = self
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -12,12 +12,17 @@ fn startup_collected_timeline_metrics_before_advancing() {
    let cache = HashMap::new();

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, SystemTime::now()),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    let now = DateTime::<Utc>::from(SystemTime::now());
@@ -33,7 +38,11 @@ fn startup_collected_timeline_metrics_before_advancing() {
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }
@@ -49,7 +58,9 @@ fn startup_collected_timeline_metrics_second_round() {
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
@@ -59,7 +70,10 @@ fn startup_collected_timeline_metrics_second_round() {
    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -69,7 +83,11 @@ fn startup_collected_timeline_metrics_second_round() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }
@@ -86,7 +104,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([
@@ -103,7 +123,10 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -113,16 +136,18 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }

+/// Tests that written sizes do not regress across restarts.
 #[test]
 fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
-    // it can happen that we lose the inmemorylayer but have previously sent metrics and we
-    // should never go backwards
-
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

@@ -140,7 +165,10 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
    let snap = TimelineSnapshot {
        loaded_at: (Lsn(50), at_restart),
        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(0),
        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(20)),
    };

    let mut cache = HashMap::from([
@@ -169,6 +197,8 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80),
        ]
    );

@@ -183,6 +213,157 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80),
+        ]
+    );
+}
+
+/// Tests that written sizes do not regress across restarts, even on child branches: with ancestor_lsn 40 above pitr_cutoff 20, both branch-local metrics are 100 - 40 = 60.
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(40),
+        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(20)),
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id)
+            .at(before_restart, 100)
+            .to_kv_pair(),
+        MetricsKey::written_size_delta(tenant_id, timeline_id)
+            .from_until(
+                way_before,
+                before_restart,
+                // the value is not taken into account, but the timestamps are important
+                999_999_999,
+            )
+            .to_kv_pair(),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
+        ]
+    );
+
+    // Feed the emitted metrics back into the cache and re-run while "still in recovery".
+    cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
+
+    // "Still in recovery" because the snapshot did not change; the values must stay the same.
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
+        ]
+    );
+}
+
+/// Tests that written sizes do not regress across restarts, even on child branches and
+/// with a PITR cutoff after the branch point (ancestor_lsn 30 < pitr_cutoff 40: 100 - 30 = 70 written, 100 - 40 = 60 history).
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(30),
+        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(40)),
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id)
+            .at(before_restart, 100)
+            .to_kv_pair(),
+        MetricsKey::written_size_delta(tenant_id, timeline_id)
+            .from_until(
+                way_before,
+                before_restart,
+                // the value is not taken into account, but the timestamps are important
+                999_999_999,
+            )
+            .to_kv_pair(),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
+        ]
+    );
+
+    // Feed the emitted metrics back into the cache and re-run while "still in recovery".
+    cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
+
+    // "Still in recovery" because the snapshot did not change; the values must stay the same.
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
+        ]
+    );
+}
@@ -201,7 +382,10 @@ fn post_restart_current_exact_logical_size_uses_cached() {
    let snap = TimelineSnapshot {
        loaded_at: (Lsn(50), at_restart),
        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(0),
        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: None,
    };

    let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
@@ -286,16 +470,101 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
    times
 }

+/// Tests that with PITR disabled the history size metric is reported as 0, even when a PITR
+/// cutoff below the last record LSN is present in the snapshot.
+#[test]
+fn pitr_disabled_yields_no_history_size() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000); // lies between initdb_lsn and disk_consistent_lsn
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: None,
+        pitr_enabled: false, // disabled: the test expects the cutoff below to be ignored
+        pitr_cutoff: Some(pitr_cutoff),
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0), // still emitted, with value 0
+        ]
+    );
+}
+
+/// Tests that uninitialized PITR cutoff does not emit any history size metric at all.
+#[test]
+fn pitr_uninitialized_does_not_emit_history_size() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: None,
+        pitr_enabled: true, // PITR is enabled here; only the cutoff is missing
+        pitr_cutoff: None, // the "uninitialized" cutoff under test
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0), // last entry: no pitr_history_size_since_parent expected
+        ]
+    );
+}
+
 pub(crate) const fn metric_examples_old(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [RawMetric; 5] {
+) -> [RawMetric; 7] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until_old_format(before, now, 0),
+        MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
+        MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
        MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
@@ -307,10 +576,12 @@ pub(crate) const fn metric_examples(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [NewRawMetric; 5] {
+) -> [NewRawMetric; 7] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
        MetricsKey::synthetic_size(tenant_id).at(now, 1),
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -513,6 +513,14 @@ mod tests {
                line!(),
                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
@@ -560,7 +568,7 @@ mod tests {
        assert_eq!(upgraded_samples, new_samples);
    }

-    fn metric_samples_old() -> [RawMetric; 5] {
+    fn metric_samples_old() -> [RawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

@@ -572,7 +580,7 @@ mod tests {
        super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
    }

-    fn metric_samples() -> [NewRawMetric; 5] {
+    fn metric_samples() -> [NewRawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
-    ValidateRequest, ValidateRequestTenant, ValidateResponse,
+    TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse,
 };
 use reqwest::Certificate;
 use serde::Serialize;
@@ -51,8 +51,15 @@ pub trait StorageControllerUpcallApi {
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
+        generation: Generation,
        status: ShardImportStatus,
    ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
+    fn get_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        generation: Generation,
+    ) -> impl Future<Output = Result<ShardImportStatus, RetryForeverError>> + Send;
 }

 impl StorageControllerUpcallClient {
@@ -97,6 +104,7 @@ impl StorageControllerUpcallClient {
        &self,
        url: &url::Url,
        request: R,
+        method: reqwest::Method,
    ) -> Result<T, RetryForeverError>
    where
        R: Serialize,
@@ -106,7 +114,7 @@ impl StorageControllerUpcallClient {
            || async {
                let response = self
                    .http_client
-                    .post(url.clone())
+                    .request(method.clone(), url.clone())
                    .json(&request)
                    .send()
                    .await?;
@@ -215,7 +223,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
            register: register.clone(),
        };

-        let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
+        let response: ReAttachResponse = self
+            .retry_http_forever(&url, request, reqwest::Method::POST)
+            .await?;
        tracing::info!(
            "Received re-attach response with {} tenants (node {}, register: {:?})",
            response.tenants.len(),
@@ -268,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                return Err(RetryForeverError::ShuttingDown);
            }

-            let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
+            let response: ValidateResponse = self
+                .retry_http_forever(&url, request, reqwest::Method::POST)
+                .await?;
            for rt in response.tenants {
                result.insert(rt.id, rt.valid);
            }
@@ -287,6 +299,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
+        generation: Generation,
        status: ShardImportStatus,
    ) -> Result<(), RetryForeverError> {
        let url = self
@@ -297,9 +310,35 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        let request = PutTimelineImportStatusRequest {
            tenant_shard_id,
            timeline_id,
+            generation,
            status,
        };

-        self.retry_http_forever(&url, request).await
+        self.retry_http_forever(&url, request, reqwest::Method::POST)
+            .await
+    }
+
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
+    async fn get_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        generation: Generation,
+    ) -> Result<ShardImportStatus, RetryForeverError> {
+        let url = self
+            .base_url
+            .join("timeline_import_status")
+            .expect("Failed to build path");
+
+        let request = TimelineImportStatusRequest { // identifies the shard, timeline and generation queried
+            tenant_shard_id,
+            timeline_id,
+            generation,
+        };
+
+        let response: ShardImportStatus = self
+            .retry_http_forever(&url, request, reqwest::Method::GET) // note: retry_http_forever sends `request` as a JSON body, also for GET
+            .await?;
+        Ok(response)
     }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -663,6 +663,7 @@ mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
    use pageserver_api::key::Key;
+    use pageserver_api::models::ShardImportStatus;
    use pageserver_api::shard::ShardIndex;
    use pageserver_api::upcall_api::ReAttachResponseTenant;
    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -792,10 +793,20 @@ mod test {
            &self,
            _tenant_shard_id: TenantShardId,
            _timeline_id: TimelineId,
+            _generation: Generation,
            _status: pageserver_api::models::ShardImportStatus,
        ) -> Result<(), RetryForeverError> {
            unimplemented!()
        }
+
+        async fn get_timeline_import_status(
+            &self,
+            _tenant_shard_id: TenantShardId,
+            _timeline_id: TimelineId,
+            _generation: Generation,
+        ) -> Result<ShardImportStatus, RetryForeverError> {
+            unimplemented!() // test stub: panics if it is ever called
+        }
    }

    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
    // actually trimmed data to), which can pass each other when PITR is changed.
    let min_readable_lsn = std::cmp::max(
-        timeline.get_gc_cutoff_lsn(),
+        timeline.get_gc_cutoff_lsn().unwrap_or_default(),
        *timeline.get_applied_gc_cutoff_lsn(),
    );

@@ -3199,7 +3199,7 @@ async fn list_aux_files(
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
-        state.conf,
+        state.conf.get_vectored_concurrent_io,
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

@@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal(
    }.instrument(span).await
 }

+/// Activate a timeline after its import has completed
+///
+/// The endpoint is idempotent and callers are expected to retry all
+/// errors until a successful response.
+async fn activate_post_import_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1); // used when the query parameter below is absent
+    let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")?
+        .map(Duration::from_millis)
+        .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT);
+
+    let span = info_span!(
+        "activate_post_import_handler",
+        tenant_id=%tenant_shard_id.tenant_id,
+        timeline_id=%timeline_id,
+        shard_id=%tenant_shard_id.shard_slug()
+    );
+
+    async move {
+        let state = get_state(&request);
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+        tenant
+            .finalize_importing_timeline(timeline_id)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        match tenant.get_timeline(timeline_id, false) {
+            Ok(_timeline) => {
+                // Timeline is already visible. Reset not required: fall through.
+            }
+            Err(GetTimelineError::NotFound { .. }) => {
+                // This is crude: we reset the whole tenant such that the new timeline is detected
+                // and activated. We can come up with something more granular in the future.
+                //
+                // Note that we only reset the tenant if required: when the timeline is
+                // not present in [`Tenant::timelines`].
+                let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+                state
+                    .tenant_manager
+                    .reset_tenant(tenant_shard_id, false, &ctx)
+                    .await
+                    .map_err(ApiError::InternalServerError)?;
+            }
+            Err(GetTimelineError::ShuttingDown) => {
+                return Err(ApiError::ShuttingDown);
+            }
+            Err(GetTimelineError::NotActive { .. }) => {
+                unreachable!("Called get_timeline with active_only=false");
+            }
+        }
+
+        let timeline = tenant.get_timeline(timeline_id, false)?; // NOTE(review): `tenant` was looked up before reset_tenant above — confirm it still sees the timeline
+
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn)
+            .with_scope_timeline(&timeline);
+
+        let result =
+            tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await;
+        match result {
+            Ok(Ok(())) => {
+                // Timeline became active within the timeout: fall through.
+            }
+            // Timeline reached some other state that's not active
+            // TODO(vlad): if the tenant is broken, return a permanent error
+            Ok(Err(_timeline_state)) => {
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Timeline activation failed"
+                )));
+            }
+            // Activation timed out
+            Err(_) => {
+                return Err(ApiError::Timeout("Timeline activation timed out".into()));
+            }
+        }
+
+        let timeline_info = build_timeline_info(
+            &timeline, false, // include_non_incremental_logical_size,
+            false, // force_await_initial_logical_size
+            &ctx,
+        )
+        .await
+        .context("get local timeline info")
+        .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, timeline_info)
+    }
+    .instrument(span)
+    .await
+}
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -3924,5 +4025,9 @@ pub fn make_router(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal",
            |r| api_handler(r, put_tenant_timeline_import_wal),
        )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
+            |r| api_handler(r, activate_post_import_handler),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,6 +3,7 @@

 mod auth;
 pub mod basebackup;
+pub mod basebackup_cache;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -843,23 +843,50 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
-        "pageserver_relsize_cache_entries",
-        "Number of entries in the relation size cache",
+        "pageserver_relsize_latest_cache_entries",
+        "Number of entries in the latest relation size cache",
    )
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
-        .expect("failed to define a metric")
+pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_latest_cache_hits",
+        "Latest relation size cache hits",
+    )
+    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
-        "pageserver_relsize_cache_misses",
-        "Relation size cache misses",
+        "pageserver_relsize_latest_cache_misses",
+        "Relation size latest cache misses",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_relsize_snapshot_cache_entries",
+        "Number of entries in the pitr relation size cache", // NOTE(review): help says "pitr", name says "snapshot" — presumably the same cache; confirm
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_snapshot_cache_hits",
+        "Pitr relation size cache hits", // NOTE(review): help says "Pitr", name says "snapshot" — presumably the same cache; confirm
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_snapshot_cache_misses",
+        "Relation size snapshot cache misses",
    )
    .expect("failed to define a metric")
 });
@@ -1039,6 +1066,15 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });

+pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tenant_offloaded_timelines",
+        "Number of offloaded timelines of a tenant",
+        &["tenant_id", "shard_id"] // one series per tenant shard; removed again in remove_tenant_metrics
+    )
+    .expect("Failed to register pageserver_tenant_offloaded_timelines metric")
+});
+
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_eviction_iteration_duration_seconds_global",
@@ -3524,11 +3560,14 @@ impl TimelineMetrics {
 }

 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+    let tid = tenant_shard_id.tenant_id.to_string();
+    let shard_id = tenant_shard_id.shard_slug().to_string();
+
    // Only shard zero deals in synthetic sizes
    if tenant_shard_id.is_shard_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }
+    let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);

    tenant_throttling::remove_tenant_metrics(tenant_shard_id);

@@ -4320,6 +4359,42 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
        .set(u64::try_from(num_threads.get()).unwrap());
 }

+pub(crate) static BASEBACKUP_CACHE_READ: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_basebackup_cache_read_total",
+        "Number of read accesses to the basebackup cache grouped by hit/miss/error",
+        &["result"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_basebackup_cache_prepare_total",
+        "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error",
+        &["result"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_basebackup_cache_entries_total", // NOTE(review): this is a gauge; Prometheus naming conventionally reserves `_total` for counters
+        "Number of entries in the basebackup cache"
+    )
+    .expect("failed to define a metric")
+});
+
+// FIXME: Support basebackup cache size metrics.
+#[allow(dead_code)]
+pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_basebackup_cache_size_bytes",
+        "Total size of all basebackup cache entries on disk in bytes"
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_config_ignored_items",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,7 +9,6 @@ use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

-use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, bail};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
@@ -18,7 +17,7 @@ use itertools::Itertools;
 use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
-    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
@@ -52,8 +51,10 @@ use utils::simple_rcu::RcuReadGuard;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;

+use crate::PERF_TRACE_TARGET;
 use crate::auth::check_permission;
 use crate::basebackup::BasebackupError;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -62,7 +63,7 @@ use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
    SmgrOpTimer, TimelineMetrics,
 };
-use crate::pgdatadir_mapping::Version;
+use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -107,6 +108,7 @@ pub fn spawn(
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
    tls_config: Option<Arc<rustls::ServerConfig>>,
+    basebackup_cache: Arc<BasebackupCache>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -128,6 +130,7 @@ pub fn spawn(
            conf.pg_auth_type,
            tls_config,
            conf.page_service_pipelining.clone(),
+            basebackup_cache,
            libpq_ctx,
            cancel.clone(),
        )
@@ -186,6 +189,7 @@ pub async fn libpq_listener_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
+    basebackup_cache: Arc<BasebackupCache>,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
 ) -> Connections {
@@ -229,6 +233,7 @@ pub async fn libpq_listener_main(
                    auth_type,
                    tls_config.clone(),
                    pipelining_config.clone(),
+                    Arc::clone(&basebackup_cache),
                    connection_ctx,
                    connections_cancel.child_token(),
                    gate_guard,
@@ -271,6 +276,7 @@ async fn page_service_conn_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
+    basebackup_cache: Arc<BasebackupCache>,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
    gate_guard: GateGuard,
@@ -331,11 +337,12 @@ async fn page_service_conn_main(
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(
-        conf,
        tenant_manager,
        auth,
        pipelining_config,
+        conf.get_vectored_concurrent_io,
        perf_span_fields,
+        basebackup_cache,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -371,7 +378,6 @@ async fn page_service_conn_main(
 }

 struct PageServerHandler {
-    conf: &'static PageServerConf,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -389,6 +395,9 @@ struct PageServerHandler {
    timeline_handles: Option<TimelineHandles>,

    pipelining_config: PageServicePipeliningConfig,
+    get_vectored_concurrent_io: GetVectoredConcurrentIo,
+
+    basebackup_cache: Arc<BasebackupCache>,

    gate_guard: GateGuard,
 }
@@ -642,7 +651,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
    timer: SmgrOpTimer,
-    effective_request_lsn: Lsn,
+    lsn_range: LsnRange,
    ctx: RequestContext,
 }

@@ -764,12 +773,12 @@ impl BatchedFeMessage {
                match batching_strategy {
                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
                        if let Some(last_in_batch) = accum_pages.last() {
-                            if last_in_batch.effective_request_lsn
-                                != this_pages[0].effective_request_lsn
+                            if last_in_batch.lsn_range.effective_lsn
+                                != this_pages[0].lsn_range.effective_lsn
                            {
                                trace!(
-                                    accum_lsn = %last_in_batch.effective_request_lsn,
-                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    accum_lsn = %last_in_batch.lsn_range.effective_lsn,
+                                    this_lsn = %this_pages[0].lsn_range.effective_lsn,
                                    "stopping batching because LSN changed"
                                );

@@ -784,15 +793,15 @@ impl BatchedFeMessage {
                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
                            batched.req.rel == this_pages[0].req.rel
                                && batched.req.blkno == this_pages[0].req.blkno
-                                && batched.effective_request_lsn
-                                    != this_pages[0].effective_request_lsn
+                                && batched.lsn_range.effective_lsn
+                                    != this_pages[0].lsn_range.effective_lsn
                        });

                        if same_page_different_lsn {
                            trace!(
                                rel=%this_pages[0].req.rel,
                                blkno=%this_pages[0].req.blkno,
-                                lsn=%this_pages[0].effective_request_lsn,
+                                lsn=%this_pages[0].lsn_range.effective_lsn,
                                "stopping batching because same page was requested at different LSNs"
                            );

@@ -844,17 +853,17 @@ impl BatchedFeMessage {
 impl PageServerHandler {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
-        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
+        basebackup_cache: Arc<BasebackupCache>,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
    ) -> Self {
        PageServerHandler {
-            conf,
            auth,
            claims: None,
            connection_ctx,
@@ -862,6 +871,8 @@ impl PageServerHandler {
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
+            get_vectored_concurrent_io,
+            basebackup_cache,
            gate_guard,
        }
    }
@@ -1158,7 +1169,7 @@ impl PageServerHandler {
                .await?;

                // We're holding the Handle
-                let effective_request_lsn = match Self::effective_request_lsn(
+                let effective_lsn = match Self::effective_request_lsn(
                    &shard,
                    shard.get_last_record_lsn(),
                    req.hdr.request_lsn,
@@ -1177,7 +1188,10 @@ impl PageServerHandler {
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
                        timer,
-                        effective_request_lsn,
+                        lsn_range: LsnRange {
+                            effective_lsn,
+                            request_lsn: req.hdr.request_lsn
+                        },
                        ctx,
                    }],
                    // The executor grabs the batch when it becomes idle.
@@ -1278,7 +1292,7 @@ impl PageServerHandler {
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all)]
-    async fn pagesteam_handle_batched_message<IO>(
+    async fn pagestream_handle_batched_message<IO>(
        &mut self,
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
@@ -1623,7 +1637,7 @@ impl PageServerHandler {
        }

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.get_vectored_concurrent_io,
            match self.gate_guard.try_clone() {
                Ok(guard) => guard,
                Err(_) => {
@@ -1733,7 +1747,7 @@ impl PageServerHandler {
            };

            let result = self
-                .pagesteam_handle_batched_message(
+                .pagestream_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
@@ -1909,7 +1923,7 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    self.pagesteam_handle_batched_message(
+                    self.pagestream_handle_batched_message(
                        pgb_writer,
                        batch,
                        io_concurrency.clone(),
@@ -2127,7 +2141,14 @@ impl PageServerHandler {
        .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_exists(
+                req.rel,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -2154,7 +2175,14 @@ impl PageServerHandler {
        .await?;

        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_size(
+                req.rel,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -2181,7 +2209,15 @@ impl PageServerHandler {
        .await?;

        let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -2214,7 +2250,7 @@ impl PageServerHandler {
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
-                    effective_lsn: batch.effective_request_lsn,
+                    effective_lsn: batch.lsn_range.effective_lsn,
                    time,
                });
            }
@@ -2229,7 +2265,7 @@ impl PageServerHandler {
                    perf_instrument = true;
                }

-                req.effective_request_lsn
+                req.lsn_range.effective_lsn
            })
            .max()
            .expect("batch is never empty");
@@ -2283,7 +2319,7 @@ impl PageServerHandler {
                    (
                        &p.req.rel,
                        &p.req.blkno,
-                        p.effective_request_lsn,
+                        p.lsn_range,
                        p.ctx.attached_child(),
                    )
                }),
@@ -2468,6 +2504,8 @@ impl PageServerHandler {
            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &self.cancel).await?;

+        let mut from_cache = false;
+
        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
        if full_backup {
@@ -2485,7 +2523,33 @@ impl PageServerHandler {
            .map_err(map_basebackup_error)?;
        } else {
            let mut writer = BufWriter::new(pgb.copyout_writer());
-            if gzip {
+
+            let cached = {
+                // Basebackup is cached only for this combination of parameters.
+                if timeline.is_basebackup_cache_enabled()
+                    && gzip
+                    && lsn.is_some()
+                    && prev_lsn.is_none()
+                {
+                    self.basebackup_cache
+                        .get(tenant_id, timeline_id, lsn.unwrap())
+                        .await
+                } else {
+                    None
+                }
+            };
+
+            if let Some(mut cached) = cached {
+                from_cache = true;
+                tokio::io::copy(&mut cached, &mut writer)
+                    .await
+                    .map_err(|e| {
+                        map_basebackup_error(BasebackupError::Client(
+                            e,
+                            "handle_basebackup_request,cached,copy",
+                        ))
+                    })?;
+            } else if gzip {
                let mut encoder = GzipEncoder::with_quality(
                    &mut writer,
                    // NOTE using fast compression because it's on the critical path
@@ -2544,6 +2608,7 @@ impl PageServerHandler {
        info!(
            lsn_await_millis = lsn_awaited_after.as_millis(),
            basebackup_millis = basebackup_after.as_millis(),
+            %from_cache,
            "basebackup complete"
        );

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,7 +43,9 @@ use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
-    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+    RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
+    RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
+    RELSIZE_SNAPSHOT_CACHE_MISSES,
 };
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
@@ -90,6 +92,28 @@ pub enum LsnForTimestamp {
    NoData(Lsn),
 }

+/// Each request to the page server contains an LSN range: `not_modified_since..request_lsn`.
+/// See the comments in libs/pageserver_api/src/models.rs.
+/// Based on this range and `last_record_lsn`, the pageserver calculates `effective_lsn`.
+/// To distinguish requests from the primary and from replicas, we also need to pass `request_lsn`.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct LsnRange {
+    pub effective_lsn: Lsn,
+    pub request_lsn: Lsn,
+}
+
+impl LsnRange {
+    pub fn at(lsn: Lsn) -> LsnRange {
+        LsnRange {
+            effective_lsn: lsn,
+            request_lsn: lsn,
+        }
+    }
+    pub fn is_latest(&self) -> bool {
+        self.request_lsn == Lsn::MAX
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CalculateLogicalSizeError {
    #[error("cancelled")]
@@ -202,13 +226,13 @@ impl Timeline {
        io_concurrency: IoConcurrency,
    ) -> Result<Bytes, PageReconstructError> {
        match version {
-            Version::Lsn(effective_lsn) => {
+            Version::LsnRange(lsns) => {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages.iter().map(|(tag, blknum)| {
-                            (tag, blknum, effective_lsn, ctx.attached_child())
-                        }),
+                        pages
+                            .iter()
+                            .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
                        io_concurrency.clone(),
                        ctx,
                    )
@@ -246,7 +270,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,7 +289,7 @@ impl Timeline {
        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
            HashMap::with_capacity(pages.len());

-        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
+        for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -274,7 +298,7 @@ impl Timeline {
                slots_filled += 1;
                continue;
            }
-
+            let lsn = lsns.effective_lsn;
            let nblocks = {
                let ctx = RequestContextBuilder::from(&ctx)
                    .perf_span(|crnt_perf_span| {
@@ -289,7 +313,7 @@ impl Timeline {
                    .attached_child();

                match self
-                    .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
+                    .get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                    .await
                {
@@ -470,7 +494,7 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
            return Ok(nblocks);
        }

@@ -488,7 +512,7 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+        self.update_cached_rel_size(tag, version, nblocks);

        Ok(nblocks)
    }
@@ -510,7 +534,7 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
            return Ok(true);
        }
        // then check if the database was already initialized.
@@ -586,7 +610,7 @@ impl Timeline {
        // scan directory listing (new), merge with the old results
        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -632,7 +656,7 @@ impl Timeline {
    ) -> Result<Bytes, PageReconstructError> {
        assert!(self.tenant_shard_id.is_shard_zero());
        let n_blocks = self
-            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
+            .get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
            .await?;

        let keyspace = KeySpace::single(
@@ -645,7 +669,7 @@ impl Timeline {
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -867,11 +891,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
                .await?;

            let keyspace = KeySpace::single(
@@ -885,7 +909,7 @@ impl Timeline {
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| PageReconstructError::Cancelled)?,
@@ -1137,7 +1161,7 @@ impl Timeline {
        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
                .await?
            {
                if self.cancel.is_cancelled() {
@@ -1212,7 +1236,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(spcnode, dbnode, Version::at(lsn), ctx)
                .await?
                .into_iter()
                .collect();
@@ -1329,59 +1353,75 @@ impl Timeline {
        Ok((dense_keyspace, sparse_keyspace))
    }

-    /// Get cached size of relation if it not updated after specified LSN
-    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
-        let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
-            if lsn >= *cached_lsn {
-                RELSIZE_CACHE_HITS.inc();
-                return Some(*nblocks);
+    /// Get the cached size of a relation. There are two caches: the latest cache, for primary updates, which captures
+    /// the latest state of the timeline; and the snapshot cache, whose key includes the LSN and so can be used by
+    /// replicas to get the relation size at a particular LSN (snapshot).
+    pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
+        let lsn = version.get_lsn();
+        {
+            let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
+            if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+                if lsn >= *cached_lsn {
+                    RELSIZE_LATEST_CACHE_HITS.inc();
+                    return Some(*nblocks);
+                }
+                RELSIZE_CACHE_MISSES_OLD.inc();
            }
-            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-        RELSIZE_CACHE_MISSES.inc();
+        {
+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
+                RELSIZE_SNAPSHOT_CACHE_HITS.inc();
+                return Some(*nblock);
+            }
+        }
+        if version.is_latest() {
+            RELSIZE_LATEST_CACHE_MISSES.inc();
+        } else {
+            RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
+        }
        None
    }

    /// Update cached relation size if there is no more recent update
-    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-
-        if lsn < rel_size_cache.complete_as_of {
-            // Do not cache old values. It's safe to cache the size on read, as long as
-            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
-            // never evict values from the cache, so if the relation size changed after
-            // 'lsn', the new value is already in the cache.
-            return;
-        }
-
-        match rel_size_cache.map.entry(tag) {
-            hash_map::Entry::Occupied(mut entry) => {
-                let cached_lsn = entry.get_mut();
-                if lsn >= cached_lsn.0 {
-                    *cached_lsn = (lsn, nblocks);
+    pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
+        let lsn = version.get_lsn();
+        if version.is_latest() {
+            let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+            match rel_size_cache.entry(tag) {
+                hash_map::Entry::Occupied(mut entry) => {
+                    let cached_lsn = entry.get_mut();
+                    if lsn >= cached_lsn.0 {
+                        *cached_lsn = (lsn, nblocks);
+                    }
+                }
+                hash_map::Entry::Vacant(entry) => {
+                    entry.insert((lsn, nblocks));
+                    RELSIZE_LATEST_CACHE_ENTRIES.inc();
                }
            }
-            hash_map::Entry::Vacant(entry) => {
-                entry.insert((lsn, nblocks));
-                RELSIZE_CACHE_ENTRIES.inc();
+        } else {
+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if rel_size_cache.capacity() != 0 {
+                rel_size_cache.insert((lsn, tag), nblocks);
+                RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
            }
        }
    }

    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
-            RELSIZE_CACHE_ENTRIES.inc();
+        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+        if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_LATEST_CACHE_ENTRIES.inc();
        }
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.remove(tag).is_some() {
-            RELSIZE_CACHE_ENTRIES.dec();
+        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+        if rel_size_cache.remove(tag).is_some() {
+            RELSIZE_LATEST_CACHE_ENTRIES.dec();
        }
    }
 }
@@ -1585,7 +1625,10 @@ impl DatadirModification<'_> {
        //       check the cache too. This is because eagerly checking the cache results in
        //       less work overall and 10% better performance. It's more work on cache miss
        //       but cache miss is rare.
-        if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
+        if let Some(nblocks) = self
+            .tline
+            .get_cached_rel_size(&rel, Version::Modified(self))
+        {
            Ok(nblocks)
        } else if !self
            .tline
@@ -2667,7 +2710,7 @@ pub struct DatadirModificationStats {
 /// timeline to not miss the latest updates.
 #[derive(Clone, Copy)]
 pub enum Version<'a> {
-    Lsn(Lsn),
+    LsnRange(LsnRange),
    Modified(&'a DatadirModification<'a>),
 }

@@ -2679,7 +2722,7 @@ impl Version<'_> {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        match self {
-            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
+            Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
            Version::Modified(modification) => modification.get(key, ctx).await,
        }
    }
@@ -2701,12 +2744,26 @@ impl Version<'_> {
        }
    }

-    fn get_lsn(&self) -> Lsn {
+    pub fn is_latest(&self) -> bool {
        match self {
-            Version::Lsn(lsn) => *lsn,
+            Version::LsnRange(lsns) => lsns.is_latest(),
+            Version::Modified(_) => true,
+        }
+    }
+
+    pub fn get_lsn(&self) -> Lsn {
+        match self {
+            Version::LsnRange(lsns) => lsns.effective_lsn,
            Version::Modified(modification) => modification.lsn,
        }
    }
+
+    pub fn at(lsn: Lsn) -> Self {
+        Version::LsnRange(LsnRange {
+            effective_lsn: lsn,
+            request_lsn: lsn,
+        })
+    }
 }

 //--- Metadata structs stored in key-value pairs in the repository.
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -380,6 +380,10 @@ pub enum TaskKind {
    DetachAncestor,

    ImportPgdata,
+
+    /// Background task of [`crate::basebackup_cache::BasebackupCache`].
+    /// Prepares basebackups and clears outdated entries.
+    BasebackupCache,
 }

 #[derive(Default)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -50,6 +50,7 @@ use remote_timeline_client::{
 use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
 use storage_broker::BrokerClientChannel;
 use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
+use timeline::import_pgdata::ImportingTimeline;
 use timeline::offload::{OffloadError, offload_timeline};
 use timeline::{
    CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata,
@@ -77,6 +78,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
 use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
+use crate::basebackup_cache::BasebackupPrepareSender;
 use crate::config::PageServerConf;
 use crate::context;
 use crate::context::RequestContextBuilder;
@@ -85,8 +87,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
-    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
-    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -156,6 +158,7 @@ pub struct TenantSharedResources {
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
+    pub basebackup_prepare_sender: BasebackupPrepareSender,
 }

 /// A [`TenantShard`] is really an _attached_ tenant.  The configuration
@@ -284,6 +287,19 @@ pub struct TenantShard {
    /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
    timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,

+    /// Tracks the timelines that are currently importing into this tenant shard.
+    ///
+    /// Note that importing timelines are also present in [`Self::timelines_creating`].
+    /// Keep this in mind when ordering lock acquisition.
+    ///
+    /// Lifetime:
+    /// * An imported timeline is created while scanning the bucket on tenant attach
+    ///   if the index part contains an `import_pgdata` entry and said field marks the import
+    ///   as in progress.
+    /// * Imported timelines are removed when the storage controller calls the post timeline
+    ///   import activation endpoint.
+    timelines_importing: std::sync::Mutex<HashMap<TimelineId, ImportingTimeline>>,
+
    /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
    /// been either downloaded or uploaded. Always Some after tenant attach.
    ///
@@ -303,12 +319,15 @@ pub struct TenantShard {
    gc_cs: tokio::sync::Mutex<()>,
    walredo_mgr: Option<Arc<WalRedoManager>>,

-    // provides access to timeline data sitting in the remote storage
+    /// Provides access to timeline data sitting in the remote storage.
    pub(crate) remote_storage: GenericRemoteStorage,

-    // Access to global deletion queue for when this tenant wants to schedule a deletion
+    /// Access to global deletion queue for when this tenant wants to schedule a deletion.
    deletion_queue_client: DeletionQueueClient,

+    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
+    basebackup_prepare_sender: BasebackupPrepareSender,
+
    /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -923,19 +942,10 @@ enum StartCreatingTimelineResult {

 #[allow(clippy::large_enum_variant, reason = "TODO")]
 enum TimelineInitAndSyncResult {
-    ReadyToActivate(Arc<Timeline>),
+    ReadyToActivate,
    NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
 }

-impl TimelineInitAndSyncResult {
-    fn ready_to_activate(self) -> Option<Arc<Timeline>> {
-        match self {
-            Self::ReadyToActivate(timeline) => Some(timeline),
-            _ => None,
-        }
-    }
-}
-
 #[must_use]
 struct TimelineInitAndSyncNeedsSpawnImportPgdata {
    timeline: Arc<Timeline>,
@@ -1012,10 +1022,6 @@ enum CreateTimelineCause {
 enum LoadTimelineCause {
    Attach,
    Unoffload,
-    ImportPgdata {
-        create_guard: TimelineCreateGuard,
-        activate: ActivateTimelineArgs,
-    },
 }

 #[derive(thiserror::Error, Debug)]
@@ -1097,7 +1103,7 @@ impl TenantShard {
        self: &Arc<Self>,
        timeline_id: TimelineId,
        resources: TimelineResources,
-        mut index_part: IndexPart,
+        index_part: IndexPart,
        metadata: TimelineMetadata,
        previous_heatmap: Option<PreviousHeatmap>,
        ancestor: Option<Arc<Timeline>>,
@@ -1106,7 +1112,7 @@ impl TenantShard {
    ) -> anyhow::Result<TimelineInitAndSyncResult> {
        let tenant_id = self.tenant_shard_id;

-        let import_pgdata = index_part.import_pgdata.take();
+        let import_pgdata = index_part.import_pgdata.clone();
        let idempotency = match &import_pgdata {
            Some(import_pgdata) => {
                CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata {
@@ -1127,7 +1133,7 @@ impl TenantShard {
            }
        };

-        let (timeline, timeline_ctx) = self.create_timeline_struct(
+        let (timeline, _timeline_ctx) = self.create_timeline_struct(
            timeline_id,
            &metadata,
            previous_heatmap,
@@ -1197,14 +1203,6 @@ impl TenantShard {

        match import_pgdata {
            Some(import_pgdata) if !import_pgdata.is_done() => {
-                match cause {
-                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
-                    LoadTimelineCause::ImportPgdata { .. } => {
-                        unreachable!(
-                            "ImportPgdata should not be reloading timeline import is done and persisted as such in s3"
-                        )
-                    }
-                }
                let mut guard = self.timelines_creating.lock().unwrap();
                if !guard.insert(timeline_id) {
                    // We should never try and load the same timeline twice during startup
@@ -1260,26 +1258,7 @@ impl TenantShard {
                    "Timeline has no ancestor and no layer files"
                );

-                match cause {
-                    LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (),
-                    LoadTimelineCause::ImportPgdata {
-                        create_guard,
-                        activate,
-                    } => {
-                        // TODO: see the comment in the task code above how I'm not so certain
-                        // it is safe to activate here because of concurrent shutdowns.
-                        match activate {
-                            ActivateTimelineArgs::Yes { broker_client } => {
-                                info!("activating timeline after reload from pgdata import task");
-                                timeline.activate(self.clone(), broker_client, None, &timeline_ctx);
-                            }
-                            ActivateTimelineArgs::No => (),
-                        }
-                        drop(create_guard);
-                    }
-                }
-
-                Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline))
+                Ok(TimelineInitAndSyncResult::ReadyToActivate)
            }
        }
    }
@@ -1312,6 +1291,7 @@ impl TenantShard {
            remote_storage,
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        } = resources;

        let attach_mode = attached_conf.location.attach_mode;
@@ -1327,6 +1307,7 @@ impl TenantShard {
            remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        ));

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -1768,7 +1749,7 @@ impl TenantShard {
                })?;

            match effect {
-                TimelineInitAndSyncResult::ReadyToActivate(_) => {
+                TimelineInitAndSyncResult::ReadyToActivate => {
                    // activation happens later, on Tenant::activate
                }
                TimelineInitAndSyncResult::NeedsSpawnImportPgdata(
@@ -1778,13 +1759,24 @@ impl TenantShard {
                        guard,
                    },
                ) => {
-                    tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
-                        timeline,
-                        import_pgdata,
-                        ActivateTimelineArgs::No,
-                        guard,
-                        ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
-                    ));
+                    let timeline_id = timeline.timeline_id;
+                    let import_task_handle =
+                        tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
+                            timeline.clone(),
+                            import_pgdata,
+                            guard,
+                            ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
+                        ));
+
+                    let prev = self.timelines_importing.lock().unwrap().insert(
+                        timeline_id,
+                        ImportingTimeline {
+                            timeline: timeline.clone(),
+                            import_task_handle,
+                        },
+                    );
+
+                    assert!(prev.is_none());
                }
            }
        }
@@ -2678,14 +2670,7 @@ impl TenantShard {
                    .await?
            }
            CreateTimelineParams::ImportPgdata(params) => {
-                self.create_timeline_import_pgdata(
-                    params,
-                    ActivateTimelineArgs::Yes {
-                        broker_client: broker_client.clone(),
-                    },
-                    ctx,
-                )
-                .await?
+                self.create_timeline_import_pgdata(params, ctx).await?
            }
        };

@@ -2759,7 +2744,6 @@ impl TenantShard {
    async fn create_timeline_import_pgdata(
        self: &Arc<Self>,
        params: CreateTimelineParamsImportPgdata,
-        activate: ActivateTimelineArgs,
        ctx: &RequestContext,
    ) -> Result<CreateTimelineResult, CreateTimelineError> {
        let CreateTimelineParamsImportPgdata {
@@ -2840,24 +2824,71 @@ impl TenantShard {

        let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();

-        tokio::spawn(self.clone().create_timeline_import_pgdata_task(
+        let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task(
            timeline.clone(),
            index_part,
-            activate,
            timeline_create_guard,
            timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
        ));

+        let prev = self.timelines_importing.lock().unwrap().insert(
+            timeline.timeline_id,
+            ImportingTimeline {
+                timeline: timeline.clone(),
+                import_task_handle,
+            },
+        );
+
+        // Idempotency is enforced higher up the stack
+        assert!(prev.is_none());
+
        // NB: the timeline doesn't exist in self.timelines at this point
        Ok(CreateTimelineResult::ImportSpawned(timeline))
    }

+    /// Finalize the import of a timeline on this shard by marking it complete in
+    /// the index part. Returns an error if the import task hasn't finished yet.
+    ///
+    /// This method is idempotent: the entry is removed from `timelines_importing` once
+    /// the index upload has completed, so later calls find nothing and return `Ok(())`.
+    pub(crate) async fn finalize_importing_timeline(
+        &self,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        let timeline = { // scope the lock guard so it is dropped before the `.await` below
+            let locked = self.timelines_importing.lock().unwrap();
+            match locked.get(&timeline_id) {
+                Some(importing_timeline) => {
+                    if !importing_timeline.import_task_handle.is_finished() {
+                        return Err(anyhow::anyhow!("Import task not done yet"));
+                    }
+
+                    importing_timeline.timeline.clone()
+                }
+                None => {
+                    return Ok(()); // not tracked as importing: nothing to finalize
+                }
+            }
+        };
+
+        timeline
+            .remote_client
+            .schedule_index_upload_for_import_pgdata_finalize()?;
+        timeline.remote_client.wait_completion().await?; // on error the map entry is kept
+
+        self.timelines_importing
+            .lock()
+            .unwrap()
+            .remove(&timeline_id);
+
+        Ok(())
+    }
+
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
    async fn create_timeline_import_pgdata_task(
        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
-        activate: ActivateTimelineArgs,
        timeline_create_guard: TimelineCreateGuard,
        ctx: RequestContext,
    ) {
@@ -2869,7 +2900,6 @@ impl TenantShard {
            .create_timeline_import_pgdata_task_impl(
                timeline,
                index_part,
-                activate,
                timeline_create_guard,
                ctx,
            )
@@ -2885,60 +2915,15 @@ impl TenantShard {
        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
-        activate: ActivateTimelineArgs,
-        timeline_create_guard: TimelineCreateGuard,
+        _timeline_create_guard: TimelineCreateGuard,
        ctx: RequestContext,
    ) -> Result<(), anyhow::Error> {
        info!("importing pgdata");
+        let ctx = ctx.with_scope_timeline(&timeline);
        import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone())
            .await
            .context("import")?;
-        info!("import done");
-
-        //
-        // Reload timeline from remote.
-        // This proves that the remote state is attachable, and it reuses the code.
-        //
-        // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
-        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
-        // But our activate() call might launch new background tasks after TenantShard::shutdown
-        // already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
-        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
-        // down while bootstrapping/branching + activating), but, the race condition is much more likely
-        // to manifest because of the long runtime of this import task.
-
-        //        in theory this shouldn't even .await anything except for coop yield
-        info!("shutting down timeline");
-        timeline.shutdown(ShutdownMode::Hard).await;
-        info!("timeline shut down, reloading from remote");
-        // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc<Timeline>
-        // let Some(timeline) = Arc::into_inner(timeline) else {
-        //     anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere");
-        // };
-        let timeline_id = timeline.timeline_id;
-
-        // load from object storage like TenantShard::attach does
-        let resources = self.build_timeline_resources(timeline_id);
-        let index_part = resources
-            .remote_client
-            .download_index_file(&self.cancel)
-            .await?;
-        let index_part = match index_part {
-            MaybeDeletedIndexPart::Deleted(_) => {
-                // likely concurrent delete call, cplane should prevent this
-                anyhow::bail!(
-                    "index part says deleted but we are not done creating yet, this should not happen but"
-                )
-            }
-            MaybeDeletedIndexPart::IndexPart(p) => p,
-        };
-        let metadata = index_part.metadata.clone();
-        self
-            .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
-                create_guard: timeline_create_guard, activate, }, &ctx)
-            .await?
-            .ready_to_activate()
-            .context("implementation error: reloaded timeline still needs import after import reported success")?;
+        info!("import done - waiting for activation");

        anyhow::Ok(())
    }
@@ -3370,6 +3355,13 @@ impl TenantShard {
                activated_timelines += 1;
            }

+            let tid = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id = self.tenant_shard_id.shard_slug().to_string();
+            let offloaded_timeline_count = timelines_offloaded_accessor.len();
+            TENANT_OFFLOADED_TIMELINES
+                .with_label_values(&[&tid, &shard_id])
+                .set(offloaded_timeline_count as u64);
+
            self.state.send_modify(move |current_state| {
                assert!(
                    matches!(current_state, TenantState::Activating(_)),
@@ -3475,6 +3467,14 @@ impl TenantShard {
                timeline.defuse_for_tenant_drop();
            });
        }
+        {
+            let mut timelines_importing = self.timelines_importing.lock().unwrap();
+            timelines_importing
+                .drain()
+                .for_each(|(_timeline_id, importing_timeline)| {
+                    importing_timeline.shutdown();
+                });
+        }
        // test_long_timeline_create_then_tenant_delete is leaning on this message
        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
@@ -3949,13 +3949,6 @@ where
    Ok(result)
 }

-enum ActivateTimelineArgs {
-    Yes {
-        broker_client: storage_broker::BrokerClientChannel,
-    },
-    No,
-}
-
 impl TenantShard {
    pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
        self.tenant_conf.load().tenant_conf.clone()
@@ -4253,6 +4246,7 @@ impl TenantShard {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
+        basebackup_prepare_sender: BasebackupPrepareSender,
    ) -> TenantShard {
        assert!(!attached_conf.location.generation.is_none());

@@ -4322,6 +4316,7 @@ impl TenantShard {
            timelines: Mutex::new(HashMap::new()),
            timelines_creating: Mutex::new(HashSet::new()),
            timelines_offloaded: Mutex::new(HashMap::new()),
+            timelines_importing: Mutex::new(HashMap::new()),
            remote_tenant_manifest: Default::default(),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
@@ -4355,6 +4350,7 @@ impl TenantShard {
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
            l0_flush_global_state,
+            basebackup_prepare_sender,
        }
    }

@@ -4607,7 +4603,7 @@ impl TenantShard {

            target.cutoffs = GcCutoffs {
                space: space_cutoff,
-                time: Lsn::INVALID,
+                time: None,
            };
        }
    }
@@ -4691,7 +4687,7 @@ impl TenantShard {
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
                        target.within_ancestor_pitr =
-                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
+                            Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
                    }
                }

@@ -4704,13 +4700,15 @@ impl TenantShard {
                    } else {
                        0
                    });
-                timeline.metrics.pitr_history_size.set(
-                    timeline
-                        .get_last_record_lsn()
-                        .checked_sub(target.cutoffs.time)
-                        .unwrap_or(Lsn(0))
-                        .0,
-                );
+                if let Some(time_cutoff) = target.cutoffs.time {
+                    timeline.metrics.pitr_history_size.set(
+                        timeline
+                            .get_last_record_lsn()
+                            .checked_sub(time_cutoff)
+                            .unwrap_or_default()
+                            .0,
+                    );
+                }

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -4719,8 +4717,8 @@ impl TenantShard {
                    let original_cutoffs = target.cutoffs.clone();
                    // GC cutoffs should never go back
                    target.cutoffs = GcCutoffs {
-                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
-                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                        space: cutoffs.space.max(original_cutoffs.space),
+                        time: cutoffs.time.max(original_cutoffs.time),
                    }
                }
            }
@@ -5272,6 +5270,7 @@ impl TenantShard {
            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
+            basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
        }
    }

@@ -5580,6 +5579,14 @@ impl TenantShard {
            }
        }

+        // Update metrics
+        let tid = self.tenant_shard_id.to_string();
+        let shard_id = self.tenant_shard_id.shard_slug().to_string();
+        let set_key = &[tid.as_str(), shard_id.as_str()][..];
+        TENANT_OFFLOADED_TIMELINES
+            .with_label_values(set_key)
+            .set(manifest.offloaded_timelines.len() as u64);
+
        // Upload the manifest. Remote storage does no retries internally, so retry here.
        match backoff::retry(
            || async {
@@ -5846,6 +5853,8 @@ pub(crate) mod harness {
        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

+            let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
+
            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
@@ -5863,6 +5872,7 @@ pub(crate) mod harness {
                self.deletion_queue.new_client(),
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
+                basebackup_requst_sender,
            ));

            let preload = tenant
@@ -8616,8 +8626,10 @@ mod tests {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let io_concurrency =
-            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            tline.conf.get_vectored_concurrent_io,
+            tline.gate.enter().unwrap(),
+        );
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
@@ -8955,7 +8967,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x30);
+            guard.cutoffs.time = Some(Lsn(0x30));
            guard.cutoffs.space = Lsn(0x30);
        }

@@ -9063,7 +9075,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.time = Some(Lsn(0x40));
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -9481,7 +9493,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -9565,7 +9577,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.time = Some(Lsn(0x40));
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -10036,7 +10048,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10099,7 +10111,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10177,7 +10189,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x38);
+            guard.cutoffs.time = Some(Lsn(0x38));
            guard.cutoffs.space = Lsn(0x38);
        }
        tline
@@ -10285,7 +10297,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10348,7 +10360,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10534,7 +10546,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
+                    time: Some(Lsn(0x10)),
                    space: Lsn(0x10),
                },
                leases: Default::default(),
@@ -10554,7 +10566,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
+                    time: Some(Lsn(0x50)),
                    space: Lsn(0x50),
                },
                leases: Default::default(),
@@ -11275,7 +11287,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11664,7 +11676,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11727,7 +11739,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -11916,7 +11928,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11979,7 +11991,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -12242,7 +12254,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -949,6 +949,35 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// If `import_pgdata` is set and not yet done, mark it done (stamping `finished_at` with the
+    /// current UTC time) and schedule a background index-file upload; otherwise do nothing.
+    pub(crate) fn schedule_index_upload_for_import_pgdata_finalize(
+        self: &Arc<Self>,
+    ) -> anyhow::Result<()> {
+        use import_pgdata::index_part_format;
+
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?; // presumably errors if the queue is not initialized - confirm
+        let to_update = match &upload_queue.dirty.import_pgdata {
+            Some(import) if !import.is_done() => Some(import),
+            Some(_) | None => None, // already done, or no import recorded: nothing to schedule
+        };
+
+        if let Some(old) = to_update {
+            let new =
+                index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done {
+                    idempotency_key: old.idempotency_key().clone(), // key and start time carry over
+                    started_at: *old.started_at(),
+                    finished_at: chrono::Utc::now().naive_utc(),
+                }));
+
+            upload_queue.dirty.import_pgdata = Some(new);
+            self.schedule_index_upload(upload_queue);
+        }
+
+        Ok(())
+    }
+
    /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
    pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
        self: &Arc<Self>,
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -668,7 +668,9 @@ impl From<DownloadError> for UpdateError {

 impl From<std::io::Error> for UpdateError {
    fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
+        if let Some(nix::errno::Errno::ENOSPC) =
+            value.raw_os_error().map(nix::errno::Errno::from_raw)
+        {
            UpdateError::NoSpace
        } else if value
            .get_ref()
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
        // than our internal space cutoff.  This means that if someone drops a database and waits for their
        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
        // the space cutoff.
-        let mut next_pitr_cutoff = gc_info.cutoffs.time;
+        let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None

        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -31,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer;
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
+use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
@@ -43,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
-use crate::config::PageServerConf;
 use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
@@ -318,11 +318,10 @@ impl IoConcurrency {
    }

    pub(crate) fn spawn_from_conf(
-        conf: &'static PageServerConf,
+        conf: GetVectoredConcurrentIo,
        gate_guard: GateGuard,
    ) -> IoConcurrency {
-        use pageserver_api::config::GetVectoredConcurrentIo;
-        let selected = match conf.get_vectored_concurrent_io {
+        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -63,7 +63,28 @@ pub struct InMemoryLayer {

    opened_at: Instant,

-    /// The above fields never change, except for `end_lsn`, which is only set once.
+    /// All versions of all pages in the layer are kept here. Indexed
+    /// by block number and LSN. The [`IndexEntry`] is an offset into the
+    /// ephemeral file where the page version is stored.
+    ///
+    /// We use a separate lock for the index to reduce the critical section
+    /// during which reads cannot be planned.
+    ///
+    /// If you need access to both the index and the underlying file at the same time,
+    /// respect the following locking order to avoid deadlocks:
+    /// 1. [`InMemoryLayer::inner`]
+    /// 2. [`InMemoryLayer::index`]
+    ///
+    /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
+    /// so it is not necessary to hold simultaneous locks on index.
+    /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
+    /// In particular:
+    /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
+    /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
+    index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
+
+    /// The above fields never change, except for `end_lsn`, which is only set once,
+    /// and `index` (see rationale there).
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,

@@ -81,11 +102,6 @@ impl std::fmt::Debug for InMemoryLayer {
 }

 pub struct InMemoryLayerInner {
-    /// All versions of all pages in the layer are kept here. Indexed
-    /// by block number and LSN. The [`IndexEntry`] is an offset into the
-    /// ephemeral file where the page version is stored.
-    index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
-
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
@@ -105,7 +121,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
    trailing_ones
 };

-/// See [`InMemoryLayerInner::index`].
+/// See [`InMemoryLayer::index`].
 ///
 /// For memory efficiency, the data is packed into a u64.
 ///
@@ -425,7 +441,7 @@ impl InMemoryLayer {
            .page_content_kind(PageContentKind::InMemoryLayer)
            .attached_child();

-        let inner = self.inner.read().await;
+        let index = self.index.read().await;

        struct ValueRead {
            entry_lsn: Lsn,
@@ -435,10 +451,7 @@ impl InMemoryLayer {
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner
-                .index
-                .range(range.start.to_compact()..range.end.to_compact())
-            {
+            for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
                let key = Key::from_compact(*key);
                let slice = vec_map.slice_range(lsn_range.clone());

@@ -466,7 +479,7 @@ impl InMemoryLayer {
                }
            }
        }
-        drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
+        drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
@@ -573,8 +586,8 @@ impl InMemoryLayer {
            start_lsn,
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
+            index: RwLock::new(BTreeMap::new()),
            inner: RwLock::new(InMemoryLayerInner {
-                index: BTreeMap::new(),
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
@@ -592,31 +605,39 @@ impl InMemoryLayer {
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
+        let (base_offset, metadata) = {
+            let mut inner = self.inner.write().await;
+            self.assert_writable();

-        let base_offset = inner.file.len();
+            let base_offset = inner.file.len();

-        let SerializedValueBatch {
-            raw,
-            metadata,
-            max_lsn: _,
-            len: _,
-        } = serialized_batch;
+            let SerializedValueBatch {
+                raw,
+                metadata,
+                max_lsn: _,
+                len: _,
+            } = serialized_batch;

-        // Write the batch to the file
-        inner.file.write_raw(&raw, ctx).await?;
-        let new_size = inner.file.len();
+            // Write the batch to the file
+            inner.file.write_raw(&raw, ctx).await?;
+            let new_size = inner.file.len();

-        let expected_new_len = base_offset
-            .checked_add(raw.len().into_u64())
-            // write_raw would error if we were to overflow u64.
-            // also IndexEntry and higher levels in
-            //the code don't allow the file to grow that large
-            .unwrap();
-        assert_eq!(new_size, expected_new_len);
+            let expected_new_len = base_offset
+                .checked_add(raw.len().into_u64())
+                // write_raw would error if we were to overflow u64.
+                // Also, IndexEntry and higher levels in
+                // the code don't allow the file to grow that large.
+                .unwrap();
+            assert_eq!(new_size, expected_new_len);
+
+            inner.resource_units.maybe_publish_size(new_size);
+
+            (base_offset, metadata)
+        };

        // Update the index with the new entries
+        let mut index = self.index.write().await;
+
        for meta in metadata {
            let SerializedValueMeta {
                key,
@@ -639,7 +660,7 @@ impl InMemoryLayer {
                will_init,
            })?;

-            let vec_map = inner.index.entry(key).or_default();
+            let vec_map = index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
                // This should not break anything, but is unexpected: ingestion code aims to filter out
@@ -658,8 +679,6 @@ impl InMemoryLayer {
            );
        }

-        inner.resource_units.maybe_publish_size(new_size);
-
        Ok(())
    }

@@ -680,6 +699,18 @@ impl InMemoryLayer {

    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
+    ///
+    /// A note on locking:
+    /// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
+    /// writes while freezing the layer. This is enforced at a higher level via
+    /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
+    /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
+    ///    Timeline::write_lock for its lifetime. The rolling is handled in
+    ///    [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a `&mut self` function,
+    ///    so it can't be called concurrently from different threads.
+    /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
+    ///    This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
+    ///    hence there can be no concurrent writes.
    pub async fn freeze(&self, end_lsn: Lsn) {
        assert!(
            self.start_lsn < end_lsn,
@@ -700,8 +731,8 @@ impl InMemoryLayer {

        #[cfg(debug_assertions)]
        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
+            let index = self.index.read().await;
+            for vec_map in index.values() {
                for (lsn, _) in vec_map.as_slice() {
                    assert!(*lsn < end_lsn);
                }
@@ -724,14 +755,11 @@ impl InMemoryLayer {
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
-        // write lock on it, so we shouldn't block anyone. There's one exception
-        // though: another thread might have grabbed a reference to this layer
-        // in `get_layer_for_write' just before the checkpointer called
-        // `freeze`, and then `write_to_disk` on it. When the thread gets the
-        // lock, it will see that it's not writeable anymore and retry, but it
-        // would have to wait until we release it. That race condition is very
-        // rare though, so we just accept the potential latency hit for now.
+        // write lock on it, so we shouldn't block anyone. See the comment on
+        // [`InMemoryLayer::freeze`] to understand how locking between the append path
+        // and layer flushing works.
        let inner = self.inner.read().await;
+        let index = self.index.read().await;

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
@@ -743,13 +771,9 @@ impl InMemoryLayer {
        let key_count = if let Some(key_range) = key_range {
            let key_range = key_range.start.to_compact()..key_range.end.to_compact();

-            inner
-                .index
-                .iter()
-                .filter(|(k, _)| key_range.contains(k))
-                .count()
+            index.iter().filter(|(k, _)| key_range.contains(k)).count()
        } else {
-            inner.index.len()
+            index.len()
        };
        if key_count == 0 {
            return Ok(None);
@@ -772,7 +796,7 @@ impl InMemoryLayer {
                let file_contents = inner.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

-                for (key, vec_map) in inner.index.iter() {
+                for (key, vec_map) in index.iter() {
                    // Write all page versions
                    for (lsn, entry) in vec_map
                        .as_slice()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ pub mod span;
 pub mod uninit;
 mod walreceiver;

+use hashlink::LruCache;
 use std::array;
 use std::cmp::{max, min};
 use std::collections::btree_map::Entry;
@@ -23,8 +24,6 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::PERF_TRACE_TARGET;
-use crate::walredo::RedoAttemptType;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -93,10 +92,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
 use super::upload_queue::NotInitialized;
 use super::{
-    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
+    AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
+use crate::PERF_TRACE_TARGET;
 use crate::aux_file::AuxFileSizeEstimator;
+use crate::basebackup_cache::BasebackupPrepareRequest;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -130,6 +131,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use crate::walingest::WalLagCooldown;
+use crate::walredo::RedoAttemptType;
 use crate::{ZERO_PAGE, task_mgr, walredo};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -195,16 +197,7 @@ pub struct TimelineResources {
    pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
    pub l0_compaction_trigger: Arc<Notify>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
-}
-
-/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
-/// ingestion considerably, because WAL ingestion needs to check on most records if the record
-/// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
-/// of the timeline (disk_consistent_lsn).  It's used on reads of relation sizes to check if the
-/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
-pub(crate) struct RelSizeCache {
-    pub(crate) complete_as_of: Lsn,
-    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+    pub basebackup_prepare_sender: BasebackupPrepareSender,
 }

 pub struct Timeline {
@@ -365,7 +358,8 @@ pub struct Timeline {
    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
-    pub(crate) rel_size_cache: RwLock<RelSizeCache>,
+    pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,

    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -447,6 +441,9 @@ pub struct Timeline {
    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,

    wait_lsn_log_slow: tokio::sync::Semaphore,
+
+    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
+    basebackup_prepare_sender: BasebackupPrepareSender,
 }

 pub(crate) enum PreviousHeatmap {
@@ -537,29 +534,24 @@ impl GcInfo {
 /// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug, Default)]
 pub(crate) struct GcCutoffs {
    /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
    /// history we must keep to retain a specified number of bytes of WAL.
    pub(crate) space: Lsn,

-    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
-    /// history we must keep to enable reading back at least the PITR interval duration.
-    pub(crate) time: Lsn,
-}
-
-impl Default for GcCutoffs {
-    fn default() -> Self {
-        Self {
-            space: Lsn::INVALID,
-            time: Lsn::INVALID,
-        }
-    }
+    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
+    /// how much history we must keep to enable reading back at least the PITR interval duration.
+    ///
+    /// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
+    /// Some(last_record_lsn).
+    pub(crate) time: Option<Lsn>,
 }

 impl GcCutoffs {
    fn select_min(&self) -> Lsn {
-        std::cmp::min(self.space, self.time)
+        // NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
+        self.space.min(self.time.unwrap_or_default())
    }
 }

@@ -1041,6 +1033,7 @@ pub(crate) enum WaitLsnWaiter<'a> {
    Tenant,
    PageService,
    HttpEndpoint,
+    BaseBackupCache,
 }

 /// Argument to [`Timeline::shutdown`].
@@ -1096,11 +1089,14 @@ impl Timeline {
    /// Get the bytes written since the PITR cutoff on this branch, and
    /// whether this branch's ancestor_lsn is within its parent's PITR.
    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
+        // TODO: for backwards compatibility, we return the full history back to 0 when the PITR
+        // cutoff has not yet been initialized. This should return None instead, but this is exposed
+        // in external HTTP APIs and callers may not handle a null value.
        let gc_info = self.gc_info.read().unwrap();
        let history = self
            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.time)
-            .unwrap_or(Lsn(0))
+            .checked_sub(gc_info.cutoffs.time.unwrap_or_default())
+            .unwrap_or_default()
            .0;
        (history, gc_info.within_ancestor_pitr)
    }
@@ -1110,9 +1106,10 @@ impl Timeline {
        self.applied_gc_cutoff_lsn.read()
    }

-    /// Read timeline's planned GC cutoff: this is the logical end of history that users
-    /// are allowed to read (based on configured PITR), even if physically we have more history.
-    pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
+    /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
+    /// to read (based on configured PITR), even if physically we have more history. Returns None
+    /// if the PITR cutoff has not yet been initialized.
+    pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
        self.gc_info.read().unwrap().cutoffs.time
    }

@@ -1563,7 +1560,8 @@ impl Timeline {
                        }
                        WaitLsnWaiter::Tenant
                        | WaitLsnWaiter::PageService
-                        | WaitLsnWaiter::HttpEndpoint => unreachable!(
+                        | WaitLsnWaiter::HttpEndpoint
+                        | WaitLsnWaiter::BaseBackupCache => unreachable!(
                            "tenant or page_service context are not expected to have task kind {:?}",
                            ctx.task_kind()
                        ),
@@ -2468,6 +2466,41 @@ impl Timeline {
            false
        }
    }
+
+    pub(crate) fn is_basebackup_cache_enabled(&self) -> bool {
+        // A value set in the tenant config wins; otherwise use the default tenant config.
+        let fallback = self.conf.default_tenant_conf.basebackup_cache_enabled;
+        let loaded = self.tenant_conf.load();
+        let per_tenant = loaded.tenant_conf.basebackup_cache_enabled;
+        per_tenant.unwrap_or(fallback)
+    }
+
+    /// Request a basebackup for the given LSN to be prepared and stored in the basebackup cache.
+    /// This only sends a request over `basebackup_prepare_sender` and does not await anything.
+    /// The actual basebackup preparation is performed in the background
+    /// by the basebackup cache on a best-effort basis; a failed send is logged and ignored.
+    pub(crate) fn prepare_basebackup(&self, lsn: Lsn) {
+        if !self.is_basebackup_cache_enabled() {
+            return;
+        }
+        if !self.tenant_shard_id.is_shard_zero() {
+            // In theory we should never get here, but just in case check it.
+            // Preparing basebackup doesn't make sense for shards other than shard zero.
+            return;
+        }
+
+        let res = self
+            .basebackup_prepare_sender
+            .send(BasebackupPrepareRequest {
+                tenant_shard_id: self.tenant_shard_id,
+                timeline_id: self.timeline_id,
+                lsn,
+            });
+        if let Err(e) = res {
+            // May happen during shutdown, it's not critical.
+            info!("Failed to send basebackup prepare request: {e:#}");
+        }
+    }
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -2545,6 +2578,13 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

+    pub(crate) fn get_pitr_interval(&self) -> Duration {
+        // A value set in the tenant config wins; otherwise use the default tenant config.
+        let fallback = self.conf.default_tenant_conf.pitr_interval;
+        let loaded = self.tenant_conf.load();
+        loaded.tenant_conf.pitr_interval.unwrap_or(fallback)
+    }
+
    fn get_compaction_period(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -2820,6 +2860,13 @@ impl Timeline {

            self.remote_client.update_config(&new_conf.location);

+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
+                if new_capacity != rel_size_cache.capacity() {
+                    rel_size_cache.set_capacity(new_capacity);
+                }
+            }
+
            self.metrics
                .evictions_with_low_residence_duration
                .write()
@@ -2878,6 +2925,14 @@ impl Timeline {
            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
        }

+        let relsize_snapshot_cache_capacity = {
+            let loaded_tenant_conf = tenant_conf.load();
+            loaded_tenant_conf
+                .tenant_conf
+                .relsize_snapshot_cache_capacity
+                .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
+        };
+
        Arc::new_cyclic(|myself| {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
@@ -2969,10 +3024,8 @@ impl Timeline {
                last_image_layer_creation_check_instant: Mutex::new(None),

                last_received_wal: Mutex::new(None),
-                rel_size_cache: RwLock::new(RelSizeCache {
-                    complete_as_of: disk_consistent_lsn,
-                    map: HashMap::new(),
-                }),
+                rel_size_latest_cache: RwLock::new(HashMap::new()),
+                rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),

                download_all_remote_layers_task_info: RwLock::new(None),

@@ -3017,6 +3070,8 @@ impl Timeline {
                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),

                wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
+
+                basebackup_prepare_sender: resources.basebackup_prepare_sender,
            };

            result.repartition_threshold =
@@ -3530,7 +3585,7 @@ impl Timeline {
                };

                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    self_ref.conf,
+                    self_ref.conf.get_vectored_concurrent_io,
                    self_ref
                        .gate
                        .enter()
@@ -5559,7 +5614,7 @@ impl Timeline {
            });

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
@@ -6230,14 +6285,12 @@ impl Timeline {

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

-        if cfg!(test) {
+        if cfg!(test) && pitr == Duration::ZERO {
            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
-            if pitr == Duration::ZERO {
-                return Ok(GcCutoffs {
-                    time: self.get_last_record_lsn(),
-                    space: space_cutoff,
-                });
-            }
+            return Ok(GcCutoffs {
+                time: Some(self.get_last_record_lsn()),
+                space: space_cutoff,
+            });
        }

        // Calculate a time-based limit on how much to retain:
@@ -6251,14 +6304,14 @@ impl Timeline {
                // PITR is not set. Retain the size-based limit, or the default time retention,
                // whichever requires less data.
                GcCutoffs {
-                    time: self.get_last_record_lsn(),
+                    time: Some(self.get_last_record_lsn()),
                    space: std::cmp::max(time_cutoff, space_cutoff),
                }
            }
            (Duration::ZERO, None) => {
                // PITR is not set, and time lookup failed
                GcCutoffs {
-                    time: self.get_last_record_lsn(),
+                    time: Some(self.get_last_record_lsn()),
                    space: space_cutoff,
                }
            }
@@ -6266,7 +6319,7 @@ impl Timeline {
                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
                // cannot advance beyond what was already GC'd, and respect space-based retention
                GcCutoffs {
-                    time: *self.get_applied_gc_cutoff_lsn(),
+                    time: Some(*self.get_applied_gc_cutoff_lsn()),
                    space: space_cutoff,
                }
            }
@@ -6274,7 +6327,7 @@ impl Timeline {
                // PITR interval is set and we looked up timestamp successfully.  Ignore
                // size based retention and make time cutoff authoritative
                GcCutoffs {
-                    time: time_cutoff,
+                    time: Some(time_cutoff),
                    space: time_cutoff,
                }
            }
@@ -6327,7 +6380,7 @@ impl Timeline {
            )
        };

-        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
+        let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
        let standby_horizon = self.standby_horizon.load();
        // Hold GC for the standby, but as a safety guard do it only within some
        // reasonable lag.
@@ -6376,7 +6429,7 @@ impl Timeline {
    async fn gc_timeline(
        &self,
        space_cutoff: Lsn,
-        time_cutoff: Lsn,
+        time_cutoff: Option<Lsn>, // None if uninitialized
        retain_lsns: Vec<Lsn>,
        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
@@ -6395,6 +6448,12 @@ impl Timeline {
            return Ok(result);
        }

+        let Some(time_cutoff) = time_cutoff else {
+            // The GC cutoff should have been computed by now, but let's be defensive.
+            info!("Nothing to GC: time_cutoff not yet computed");
+            return Ok(result);
+        };
+
        // We need to ensure that no one tries to read page versions or create
        // branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
        // for details. This will block until the old value is no longer in use.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1526,7 +1526,7 @@ impl Timeline {
        info!(
            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
                checked {layers_checked}/{layers_total} layers \
-                (latest_gc_cutoff={} pitr_cutoff={})",
+                (latest_gc_cutoff={} pitr_cutoff={:?})",
            layers_to_rewrite.len(),
            drop_layers.len(),
            *latest_gc_cutoff,
@@ -3435,6 +3435,7 @@ impl Timeline {

        // Step 2: Produce images+deltas.
        let mut accumulated_values = Vec::new();
+        let mut accumulated_values_estimated_size = 0;
        let mut last_key: Option<Key> = None;

        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
@@ -3611,12 +3612,16 @@ impl Timeline {
                if last_key.is_none() {
                    last_key = Some(key);
                }
+                accumulated_values_estimated_size += val.estimated_size();
                accumulated_values.push((key, lsn, val));

-                if accumulated_values.len() >= 65536 {
-                    // Assume all of them are images, that would be 512MB of data in memory for a single key.
+                // Accumulated values should never exceed 512MB.
+                if accumulated_values_estimated_size >= 1024 * 1024 * 512 {
                    return Err(CompactionError::Other(anyhow!(
-                        "too many values for a single key, giving up gc-compaction"
+                        "too many values for a single key: {} for key {}, {} items",
+                        accumulated_values_estimated_size,
+                        key,
+                        accumulated_values.len()
                    )));
                }
            } else {
@@ -3651,6 +3656,7 @@ impl Timeline {
                    .map_err(CompactionError::Other)?;
                accumulated_values.clear();
                *last_key = key;
+                accumulated_values_estimated_size = val.estimated_size();
                accumulated_values.push((key, lsn, val));
            }
        }
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
    );
    let io_concurrency = IoConcurrency::spawn_from_conf(
-        detached.conf,
+        detached.conf.get_vectored_concurrent_io,
        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
    );
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -1,8 +1,10 @@
 use std::sync::Arc;

 use anyhow::{Context, bail};
+use importbucket_client::{ControlFile, RemoteStorageWrapper};
 use pageserver_api::models::ShardImportStatus;
 use remote_storage::RemotePath;
+use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use utils::lsn::Lsn;
@@ -17,6 +19,17 @@ mod importbucket_client;
 mod importbucket_format;
 pub(crate) mod index_part_format;

+pub(crate) struct ImportingTimeline {
+    pub import_task_handle: JoinHandle<()>, // aborted by `shutdown`
+    pub timeline: Arc<Timeline>,
+}
+
+impl ImportingTimeline {
+    pub(crate) fn shutdown(self) {
+        self.import_task_handle.abort(); // requests cancellation; does not wait for the task to exit
+    }
+}
+
 pub async fn doit(
    timeline: &Arc<Timeline>,
    index_part: index_part_format::Root,
@@ -26,173 +39,225 @@ pub async fn doit(
    let index_part_format::Root::V1(v1) = index_part;
    let index_part_format::InProgress {
        location,
-        idempotency_key,
-        started_at,
+        idempotency_key: _,
+        started_at: _,
    } = match v1 {
        index_part_format::V1::Done(_) => return Ok(()),
        index_part_format::V1::InProgress(in_progress) => in_progress,
    };

-    let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
+    let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);

-    let status_prefix = RemotePath::from_string("status").unwrap();
+    let shard_status = storcon_client
+        .get_timeline_import_status(
+            timeline.tenant_shard_id,
+            timeline.timeline_id,
+            timeline.generation,
+        )
+        .await
+        .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;

-    //
-    // See if shard is done.
-    // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing.
-    //
-    let shard_status_key =
-        status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug()));
-    let shard_status: Option<importbucket_format::ShardStatus> =
-        storage.get_json(&shard_status_key).await?;
    info!(?shard_status, "peeking shard status");
-    if shard_status.map(|st| st.done).unwrap_or(false) {
-        info!("shard status indicates that the shard is done, skipping import");
-    } else {
-        // TODO: checkpoint the progress into the IndexPart instead of restarting
-        // from the beginning.
+    match shard_status {
+        ShardImportStatus::InProgress(maybe_progress) => {
+            let storage =
+                importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;

-        //
-        // Wipe the slate clean - the flow does not allow resuming.
-        // We can implement resuming in the future by checkpointing the progress into the IndexPart.
-        //
-        info!("wipe the slate clean");
-        {
-            // TODO: do we need to hold GC lock for this?
-            let mut guard = timeline.layers.write().await;
-            assert!(
-                guard.layer_map()?.open_layer.is_none(),
-                "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
-            );
-            let all_layers_keys = guard.all_persistent_layers();
-            let all_layers: Vec<_> = all_layers_keys
-                .iter()
-                .map(|key| guard.get_from_key(key))
-                .collect();
-            let open = guard.open_mut().context("open_mut")?;
+            let control_file_res = if maybe_progress.is_none() {
+                // Only prepare the import once when there's no progress.
+                prepare_import(timeline, storage.clone(), &cancel).await
+            } else {
+                storage.get_control_file().await
+            };

-            timeline.remote_client.schedule_gc_update(&all_layers)?;
-            open.finish_gc_timeline(&all_layers);
-        }
-
-        //
-        // Wait for pgdata to finish uploading
-        //
-        info!("wait for pgdata to reach status 'done'");
-        let pgdata_status_key = status_prefix.join("pgdata");
-        loop {
-            let res = async {
-                let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
-                    .get_json(&pgdata_status_key)
-                    .await
-                    .context("get pgdata status")?;
-                info!(?pgdata_status, "peeking pgdata status");
-                if pgdata_status.map(|st| st.done).unwrap_or(false) {
-                    Ok(())
-                } else {
-                    Err(anyhow::anyhow!("pgdata not done yet"))
-                }
-            }
-            .await;
-            match res {
-                Ok(_) => break,
+            let control_file = match control_file_res {
+                Ok(cf) => cf,
                Err(err) => {
-                    info!(?err, "indefinitely waiting for pgdata to finish");
-                    if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
-                        .await
-                        .is_ok()
-                    {
-                        bail!("cancelled while waiting for pgdata");
-                    }
+                    return Err(
+                        terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
+                    );
                }
-            }
-        }
+            };

-        //
-        // Do the import
-        //
-        info!("do the import");
-        let control_file = storage.get_control_file().await?;
-        let base_lsn = control_file.base_lsn();
-
-        info!("update TimelineMetadata based on LSNs from control file");
-        {
-            let pg_version = control_file.pg_version();
-            let _ctx: &RequestContext = ctx;
-            async move {
-                // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
-                // checkpoint record, and prev_record_lsn should point to its beginning.
-                // We should read the real end of the record from the WAL, but here we
-                // just fake it.
-                let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
-                let prev_record_lsn = base_lsn;
-                let metadata = TimelineMetadata::new(
-                    disk_consistent_lsn,
-                    Some(prev_record_lsn),
-                    None,     // no ancestor
-                    Lsn(0),   // no ancestor lsn
-                    base_lsn, // latest_gc_cutoff_lsn
-                    base_lsn, // initdb_lsn
-                    pg_version,
+            let res = flow::run(
+                timeline.clone(),
+                control_file,
+                storage.clone(),
+                maybe_progress,
+                ctx,
+            )
+            .await;
+            if let Err(err) = res {
+                return Err(
+                    terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
                );
-
-                let _start_lsn = disk_consistent_lsn + 1;
-
-                timeline
-                    .remote_client
-                    .schedule_index_upload_for_full_metadata_update(&metadata)?;
-
-                timeline.remote_client.wait_completion().await?;
-
-                anyhow::Ok(())
            }
+
+            // Communicate that shard is done.
+            // Ensure at-least-once delivery of the upcall to storage controller
+            // before we mark the task as done and never come here again.
+            //
+            // Note that we do not mark the import complete in the index part now.
+            // This happens in [`Tenant::finalize_importing_timeline`] in response
+            // to the storage controller calling
+            // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`.
+            storcon_client
+                .put_timeline_import_status(
+                    timeline.tenant_shard_id,
+                    timeline.timeline_id,
+                    timeline.generation,
+                    ShardImportStatus::Done,
+                )
+                .await
+                .map_err(|_err| {
+                    anyhow::anyhow!("Shut down while putting timeline import status")
+                })?;
+        }
+        ShardImportStatus::Error(err) => {
+            info!(
+                "shard status indicates that the shard is done (error), skipping import {}",
+                err
+            );
+        }
+        ShardImportStatus::Done => {
+            info!("shard status indicates that the shard is done (success), skipping import");
        }
-        .await?;
-
-        flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
-
-        //
-        // Communicate that shard is done.
-        // Ensure at-least-once delivery of the upcall to storage controller
-        // before we mark the task as done and never come here again.
-        //
-        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);
-        storcon_client
-            .put_timeline_import_status(
-                timeline.tenant_shard_id,
-                timeline.timeline_id,
-                // TODO(vlad): What about import errors?
-                ShardImportStatus::Done,
-            )
-            .await
-            .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?;
-
-        storage
-            .put_json(
-                &shard_status_key,
-                &importbucket_format::ShardStatus { done: true },
-            )
-            .await
-            .context("put shard status")?;
    }

-    //
-    // Mark as done in index_part.
-    // This makes subsequent timeline loads enter the normal load code path
-    // instead of spawning the import task and calling this here function.
-    //
-    info!("mark import as complete in index part");
-    timeline
-        .remote_client
-        .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1(
-            index_part_format::V1::Done(index_part_format::Done {
-                idempotency_key,
-                started_at,
-                finished_at: chrono::Utc::now().naive_utc(),
-            }),
-        )))?;
-
-    timeline.remote_client.wait_completion().await?;
-
    Ok(())
 }
+
+async fn prepare_import(
+    timeline: &Arc<Timeline>,
+    storage: RemoteStorageWrapper,
+    cancel: &CancellationToken,
+) -> anyhow::Result<ControlFile> {
+    // Wipe the slate clean before starting the import as a precaution.
+    // This method is only called when there's no recorded checkpoint for the import
+    // in the storage controller.
+    //
+    // Note that this is split-brain safe (two imports for same timeline shards running in
+    // different generations) because we go through the usual deletion path, including deletion queue.
+    info!("wipe the slate clean");
+    {
+        // TODO: do we need to hold GC lock for this?
+        let mut guard = timeline.layers.write().await;
+        assert!(
+            guard.layer_map()?.open_layer.is_none(),
+            "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+        );
+        let all_layers_keys = guard.all_persistent_layers();
+        let all_layers: Vec<_> = all_layers_keys
+            .iter()
+            .map(|key| guard.get_from_key(key))
+            .collect();
+        let open = guard.open_mut().context("open_mut")?;
+
+        timeline.remote_client.schedule_gc_update(&all_layers)?;
+        open.finish_gc_timeline(&all_layers);
+    }
+
+    //
+    // Wait for pgdata to finish uploading
+    //
+    info!("wait for pgdata to reach status 'done'");
+    let status_prefix = RemotePath::from_string("status").unwrap();
+    let pgdata_status_key = status_prefix.join("pgdata");
+    loop {
+        let res = async {
+            let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                .get_json(&pgdata_status_key)
+                .await
+                .context("get pgdata status")?;
+            info!(?pgdata_status, "peeking pgdata status");
+            if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                Ok(())
+            } else {
+                Err(anyhow::anyhow!("pgdata not done yet"))
+            }
+        }
+        .await;
+        match res {
+            Ok(_) => break,
+            Err(err) => {
+                info!(?err, "indefinitely waiting for pgdata to finish");
+                if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+                    .await
+                    .is_ok()
+                {
+                    bail!("cancelled while waiting for pgdata");
+                }
+            }
+        }
+    }
+
+    let control_file = storage.get_control_file().await?;
+    let base_lsn = control_file.base_lsn();
+
+    info!("update TimelineMetadata based on LSNs from control file");
+    {
+        let pg_version = control_file.pg_version();
+        async move {
+            // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+            // checkpoint record, and prev_record_lsn should point to its beginning.
+            // We should read the real end of the record from the WAL, but here we
+            // just fake it.
+            let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+            let prev_record_lsn = base_lsn;
+            let metadata = TimelineMetadata::new(
+                disk_consistent_lsn,
+                Some(prev_record_lsn),
+                None,     // no ancestor
+                Lsn(0),   // no ancestor lsn
+                base_lsn, // latest_gc_cutoff_lsn
+                base_lsn, // initdb_lsn
+                pg_version,
+            );
+
+            let _start_lsn = disk_consistent_lsn + 1;
+
+            timeline
+                .remote_client
+                .schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+            timeline.remote_client.wait_completion().await?;
+
+            anyhow::Ok(())
+        }
+    }
+    .await?;
+
+    Ok(control_file)
+}
+
+async fn terminate_flow_with_error(
+    timeline: &Arc<Timeline>,
+    error: anyhow::Error,
+    storcon_client: &StorageControllerUpcallClient,
+    cancel: &CancellationToken,
+) -> anyhow::Error {
+    // The import task is aborted on tenant shutdown, so in principle, it should
+    // never be cancelled. To be on the safe side, check the cancellation tokens
+    // before marking the import as failed.
+    if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) {
+        let notify_res = storcon_client
+            .put_timeline_import_status(
+                timeline.tenant_shard_id,
+                timeline.timeline_id,
+                timeline.generation,
+                ShardImportStatus::Error(format!("{error:#}")),
+            )
+            .await;
+
+        if let Err(_notify_error) = notify_res {
+            // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries
+            // forever internally, so errors returned by it can only be due to cancellation.
+            info!("failed to notify storcon about permanent import error");
+        }
+
+        // Will be logged by [`Tenant::create_timeline_import_pgdata_task`]
+        error
+    } else {
+        anyhow::anyhow!("Import task cancelled")
+    }
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -29,10 +29,11 @@
 //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)

 use std::collections::HashSet;
+use std::hash::{Hash, Hasher};
 use std::ops::Range;
 use std::sync::Arc;

-use anyhow::{bail, ensure};
+use anyhow::ensure;
 use bytes::Bytes;
 use futures::stream::FuturesOrdered;
 use itertools::Itertools;
@@ -43,6 +44,7 @@ use pageserver_api::key::{
    slru_segment_size_to_key,
 };
 use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
+use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::relfile_utils::parse_relfilename;
@@ -53,21 +55,42 @@ use tokio_stream::StreamExt;
 use tracing::{debug, instrument};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
+use utils::pausable_failpoint;

 use super::Timeline;
 use super::importbucket_client::{ControlFile, RemoteStorageWrapper};
 use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::pgdatadir_mapping::{
    DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::storage_layer::{ImageLayerWriter, Layer};
+use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};

 pub async fn run(
    timeline: Arc<Timeline>,
    control_file: ControlFile,
    storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgress>,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    // Match how we run the import based on the progress version.
+    // If there's no import progress, it means that this is a new import
+    // and we can use whichever version we want.
+    match import_progress {
+        Some(ShardImportProgress::V1(progress)) => {
+            run_v1(timeline, control_file, storage, Some(progress), ctx).await
+        }
+        None => run_v1(timeline, control_file, storage, None, ctx).await,
+    }
+}
+
+async fn run_v1(
+    timeline: Arc<Timeline>,
+    control_file: ControlFile,
+    storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgressV1>,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    let planner = Planner {
@@ -79,7 +102,32 @@ pub async fn run(

    let import_config = &timeline.conf.timeline_import_config;
    let plan = planner.plan(import_config).await?;
-    plan.execute(timeline, import_config, ctx).await
+
+    // Hash the plan and compare with the hash of the plan we got back from the storage controller.
+    // If the two match, it means that the planning stage had the same output.
+    //
+    // This is not intended to be a cryptographically secure hash.
+    const SEED: u64 = 42;
+    let mut hasher = twox_hash::XxHash64::with_seed(SEED);
+    plan.hash(&mut hasher);
+    let plan_hash = hasher.finish();
+
+    if let Some(progress) = &import_progress {
+        if plan_hash != progress.import_plan_hash {
+            anyhow::bail!("Import plan does not match storcon metadata");
+        }
+
+        // Handle collisions on jobs of unequal length
+        if progress.jobs != plan.jobs.len() {
+            anyhow::bail!("Import plan job length does not match storcon metadata")
+        }
+    }
+
+    pausable_failpoint!("import-timeline-pre-execute-pausable");
+
+    let start_from_job_idx = import_progress.map(|progress| progress.completed);
+    plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx)
+        .await
 }

 struct Planner {
@@ -89,8 +137,11 @@ struct Planner {
    tasks: Vec<AnyImportTask>,
 }

+#[derive(Hash)]
 struct Plan {
    jobs: Vec<ChunkProcessingJob>,
+    // Included here such that it ends up in the hash for the plan
+    shard: ShardIdentity,
 }

 impl Planner {
@@ -194,7 +245,10 @@ impl Planner {
            pgdata_lsn,
        ));

-        Ok(Plan { jobs })
+        Ok(Plan {
+            jobs,
+            shard: self.shard,
+        })
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
@@ -323,25 +377,45 @@ impl Plan {
    async fn execute(
        self,
        timeline: Arc<Timeline>,
+        start_after_job_idx: Option<usize>,
+        import_plan_hash: u64,
        import_config: &TimelineImportConfig,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel);
+
        let mut work = FuturesOrdered::new();
        let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into()));

        let jobs_in_plan = self.jobs.len();

-        let mut jobs = self.jobs.into_iter().enumerate().peekable();
-        let mut results = Vec::new();
+        let mut jobs = self
+            .jobs
+            .into_iter()
+            .enumerate()
+            .map(|(idx, job)| (idx + 1, job))
+            .filter(|(idx, _job)| {
+                // Filter out any jobs that have been done already
+                if let Some(start_after) = start_after_job_idx {
+                    *idx > start_after
+                } else {
+                    true
+                }
+            })
+            .peekable();
+
+        let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
+        let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();

        // Run import jobs concurrently up to the limit specified by the pageserver configuration.
        // Note that we process completed futures in the oreder of insertion. This will be the
        // building block for resuming imports across pageserver restarts or tenant migrations.
-        while results.len() < jobs_in_plan {
+        while last_completed_job_idx < jobs_in_plan {
            tokio::select! {
                permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => {
                    let permit = permit.expect("never closed");
                    let (job_idx, job) = jobs.next().expect("we peeked");
+
                    let job_timeline = timeline.clone();
                    let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);

@@ -353,13 +427,35 @@ impl Plan {
                },
                maybe_complete_job_idx = work.next() => {
                    match maybe_complete_job_idx {
-                        Some(Ok((_job_idx, res))) => {
-                            results.push(res);
+                        Some(Ok((job_idx, res))) => {
+                            assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx);
+
+                            res?;
+                            last_completed_job_idx = job_idx;
+
+                            if last_completed_job_idx % checkpoint_every == 0 {
+                                let progress = ShardImportProgressV1 {
+                                    jobs: jobs_in_plan,
+                                    completed: last_completed_job_idx,
+                                    import_plan_hash,
+                                };
+
+                                storcon_client.put_timeline_import_status(
+                                    timeline.tenant_shard_id,
+                                    timeline.timeline_id,
+                                    timeline.generation,
+                                    ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress)))
+                                )
+                                .await
+                                .map_err(|_err| {
+                                    anyhow::anyhow!("Shut down while putting timeline import status")
+                                })?;
+                            }
                        },
                        Some(Err(_)) => {
-                            results.push(Err(anyhow::anyhow!(
-                                "parallel job panicked or cancelled, check pageserver logs"
-                            )));
+                            anyhow::bail!(
+                                "import job panicked or cancelled"
+                            );
                        }
                        None => {}
                    }
@@ -367,17 +463,7 @@ impl Plan {
            }
        }

-        if results.iter().all(|r| r.is_ok()) {
-            Ok(())
-        } else {
-            let mut msg = String::new();
-            for result in results {
-                if let Err(err) = result {
-                    msg.push_str(&format!("{err:?}\n\n"));
-                }
-            }
-            bail!("Some parallel jobs failed:\n\n{msg}");
-        }
+        Ok(())
    }
 }

@@ -549,6 +635,15 @@ struct ImportSingleKeyTask {
    buf: Bytes,
 }

+impl Hash for ImportSingleKeyTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSingleKeyTask { key, buf } = self;
+
+        key.hash(state);
+        buf.hash(state);
+    }
+}
+
 impl ImportSingleKeyTask {
    fn new(key: Key, buf: Bytes) -> Self {
        ImportSingleKeyTask { key, buf }
@@ -577,6 +672,20 @@ struct ImportRelBlocksTask {
    storage: RemoteStorageWrapper,
 }

+impl Hash for ImportRelBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportRelBlocksTask {
+            shard_identity: _,
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportRelBlocksTask {
    fn new(
        shard_identity: ShardIdentity,
@@ -661,6 +770,19 @@ struct ImportSlruBlocksTask {
    storage: RemoteStorageWrapper,
 }

+impl Hash for ImportSlruBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSlruBlocksTask {
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportSlruBlocksTask {
    fn new(key_range: Range<Key>, path: &RemotePath, storage: RemoteStorageWrapper) -> Self {
        ImportSlruBlocksTask {
@@ -703,6 +825,7 @@ impl ImportTask for ImportSlruBlocksTask {
    }
 }

+#[derive(Hash)]
 enum AnyImportTask {
    SingleKey(ImportSingleKeyTask),
    RelBlocks(ImportRelBlocksTask),
@@ -749,6 +872,7 @@ impl From<ImportSlruBlocksTask> for AnyImportTask {
    }
 }

+#[derive(Hash)]
 struct ChunkProcessingJob {
    range: Range<Key>,
    tasks: Vec<AnyImportTask>,
@@ -786,17 +910,54 @@ impl ChunkProcessingJob {

        let resident_layer = if nimages > 0 {
            let (desc, path) = writer.finish(ctx).await?;
+
+            {
+                let guard = timeline.layers.read().await;
+                let existing_layer = guard.try_get_from_key(&desc.key());
+                if let Some(layer) = existing_layer {
+                    // A layer with this key left over from an older generation is expected
+                    // when a job is re-run (not every job is checkpointed); it is replaced
+                    // below. Only a layer already written in the current generation is an error.
+                    if layer.metadata().generation == timeline.generation {
+                        return Err(anyhow::anyhow!(
+                            "Import attempted to rewrite layer file in the same generation: {}",
+                            layer.local_path()
+                        ));
+                    }
+                }
+            }
+
            Layer::finish_creating(timeline.conf, &timeline, desc, &path)?
        } else {
            // dropping the writer cleans up
            return Ok(());
        };

-        // this is sharing the same code as create_image_layers
+        // The same import job might run multiple times since not each job is checkpointed.
+        // Hence, we must support the cases where the layer already exists. We cannot be
+        // certain that the existing layer is identical to the new one, so in that case
+        // we replace the old layer with the one we just generated.
+
        let mut guard = timeline.layers.write().await;
-        guard
-            .open_mut()?
-            .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+
+        let existing_layer = guard
+            .try_get_from_key(&resident_layer.layer_desc().key())
+            .cloned();
+        match existing_layer {
+            Some(existing) => {
+                guard.open_mut()?.rewrite_layers(
+                    &[(existing.clone(), resident_layer.clone())],
+                    &[],
+                    &timeline.metrics,
+                );
+            }
+            None => {
+                guard
+                    .open_mut()?
+                    .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+            }
+        }
+
        crate::tenant::timeline::drop_wlock(guard);

        timeline
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -190,31 +190,6 @@ impl RemoteStorageWrapper {
        Ok(Some(res))
    }

-    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
-    pub async fn put_json<T>(&self, path: &RemotePath, value: &T) -> anyhow::Result<()>
-    where
-        T: serde::Serialize,
-    {
-        let buf = serde_json::to_vec(value)?;
-        let bytes = Bytes::from(buf);
-        utils::backoff::retry(
-            || async {
-                let size = bytes.len();
-                let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
-                self.storage
-                    .upload_storage_object(bytes, size, path, &self.cancel)
-                    .await
-            },
-            remote_storage::TimeoutOrCancel::caused_by_cancel,
-            1,
-            u32::MAX,
-            &format!("put json {path}"),
-            &self.cancel,
-        )
-        .await
-        .expect("practically infinite retries")
-    }
-
    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
    pub async fn get_range(
        &self,
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
@@ -5,9 +5,3 @@ pub struct PgdataStatus {
    pub done: bool,
    // TODO: remaining fields
 }
-
-#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
-pub struct ShardStatus {
-    pub done: bool,
-    // TODO: remaining fields
-}
--- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs
@@ -64,4 +64,12 @@ impl Root {
            },
        }
    }
+    pub fn started_at(&self) -> &chrono::NaiveDateTime {
+        match self {
+            Root::V1(v1) => match v1 {
+                V1::InProgress(in_progress) => &in_progress.started_at,
+                V1::Done(done) => &done.started_at,
+            },
+        }
+    }
 }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -408,7 +408,7 @@ impl OpenFiles {
 /// error types may be elegible for retry.
 pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
+    match e.raw_os_error().map(nix::errno::Errno::from_raw) {
        Some(EIO) => {
            // Terminate on EIO because we no longer trust the device to store
            // data safely, or to uphold persistence guarantees on fsync.
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std(
 ) -> std::io::Error {
    match e {
        tokio_epoll_uring::Error::Op(e) => e,
-        tokio_epoll_uring::Error::System(system) => {
-            std::io::Error::new(std::io::ErrorKind::Other, system)
-        }
+        tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
    }
 }

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1316,6 +1316,10 @@ impl WalIngest {
            }
        });

+        if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
+            modification.tline.prepare_basebackup(lsn);
+        }
+
        Ok(())
    }

@@ -1684,31 +1688,31 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1719,7 +1723,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x20)),
+                    Version::at(Lsn(0x20)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1733,7 +1737,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x30)),
+                    Version::at(Lsn(0x30)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1747,7 +1751,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x40)),
+                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1760,7 +1764,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x40)),
+                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1774,7 +1778,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1787,7 +1791,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1800,7 +1804,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1820,7 +1824,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            2
        );
@@ -1829,7 +1833,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x60)),
+                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1842,7 +1846,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x60)),
+                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1854,7 +1858,7 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1863,7 +1867,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1880,7 +1884,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
                .await?,
            0
        );
@@ -1893,7 +1897,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
                .await?,
            2
        );
@@ -1902,7 +1906,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x70)),
+                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1915,7 +1919,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x70)),
+                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1932,7 +1936,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            1501
        );
@@ -1942,7 +1946,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blk,
-                        Version::Lsn(Lsn(0x80)),
+                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -1956,7 +1960,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1500,
-                    Version::Lsn(Lsn(0x80)),
+                    Version::at(Lsn(0x80)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1990,13 +1994,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );
@@ -2011,7 +2015,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
                .await?,
            false
        );
@@ -2029,13 +2033,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            1
        );
@@ -2077,26 +2081,26 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            relsize
        );
@@ -2110,7 +2114,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(lsn),
+                        Version::at(lsn),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2131,7 +2135,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            1
        );
@@ -2144,7 +2148,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x60)),
+                        Version::at(Lsn(0x60)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2157,7 +2161,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            relsize
        );
@@ -2169,7 +2173,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x50)),
+                        Version::at(Lsn(0x50)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2193,13 +2197,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            relsize
        );
@@ -2212,7 +2216,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x80)),
+                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2250,7 +2254,7 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE + 1
        );
@@ -2264,7 +2268,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE
        );
@@ -2279,7 +2283,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE - 1
        );
@@ -2297,7 +2301,7 @@ mod tests {
            m.commit(&ctx).await?;
            assert_eq!(
                tline
-                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                    .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                    .await?,
                size as BlockNumber
            );
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -936,6 +936,60 @@ lfc_prewarm_main(Datum main_arg)
 	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
 }

+/*
+ * lfc_invalidate() -- mark cached pages of a relation fork as not present.
+ *
+ * Probes the cache hash once per chunk for block numbers 0 .. nblocks-1 of
+ * (rinfo, forkNum). In every chunk entry found, each page in AVAILABLE state
+ * is switched to UNAVAILABLE and lfc_ctl->used_pages is decremented. Pages
+ * in any other state are left as they are, and the chunk entries themselves
+ * stay in the hash table (only HASH_FIND is used).
+ */
+void
+lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
+{
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	uint32		hash;
+
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+		return;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	/* LFC_ENABLED() is evaluated only once lfc_lock is held. */
+	if (LFC_ENABLED())
+	{
+		/*
+		 * Walk chunk by chunk. The counter is 64-bit on purpose: with a
+		 * BlockNumber (uint32) counter, "blkno += lfc_blocks_per_chunk"
+		 * wraps around when nblocks lies within one chunk of 2^32, and the
+		 * loop would then never terminate while holding lfc_lock.
+		 */
+		for (uint64 blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
+		{
+			tag.blockNum = (BlockNumber) blkno;
+			hash = get_hash_value(lfc_hash, &tag);
+			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
+			if (entry != NULL)
+			{
+				for (int i = 0; i < lfc_blocks_per_chunk; i++)
+				{
+					if (GET_STATE(entry, i) == AVAILABLE)
+					{
+						lfc_ctl->used_pages -= 1;
+						SET_STATE(entry, i, UNAVAILABLE);
+					}
+				}
+			}
+		}
+	}
+	LWLockRelease(lfc_lock);
+}

 /*
 * Check if page is present in the cache.
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -28,6 +28,7 @@ typedef struct FileCacheState
 extern bool lfc_store_prefetch_result;

 /* functions for local file cache */
+extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
 extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
 					   BlockNumber blkno, const void *const *buffers,
 					   BlockNumber nblocks);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

 #define InvalidRelFileNumber InvalidOid

-#define SMgrRelGetRelInfo(reln) \
+#define SMgrRelGetRelInfo(reln)				\
 	(reln->smgr_rnode.node)

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,6 +148,17 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

+/*
+ * Reset an NRelFileInfo: the fields reached through NInfoGetSpcOid() and
+ * NInfoGetDbOid() become InvalidOid, the one reached through
+ * NInfoGetRelNumber() becomes InvalidRelFileNumber. "rinfo" is expanded
+ * three times, so pass a plain lvalue without side effects.
+ */
+#define NRelFileInfoInvalidate(rinfo) do { \
+		NInfoGetSpcOid(rinfo) = InvalidOid; \
+		NInfoGetDbOid(rinfo) = InvalidOid; \
+		NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
+	} while (0)
+
 #if PG_MAJORVERSION_NUM < 17
 #define ProcNumber BackendId
 #define INVALID_PROC_NUMBER InvalidBackendId
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -108,7 +108,7 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;

-static SMgrRelation unlogged_build_rel = NULL;
+static NRelFileInfo unlogged_build_rel_info;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,16 +912,19 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdextend(reln, forkNum, blkno, buffer, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
 			return;

 		default:
@@ -1003,21 +1006,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-			{
-				for (int i = 0; i < nblocks; i++)
-				{
-					lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-				}
-			}
 			return;

 		default:
@@ -1281,75 +1282,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }

-#if PG_MAJORVERSION_NUM < 17
-/*
- *	neon_read() -- Read the specified block from a relation.
- */
-#if PG_MAJORVERSION_NUM < 16
-static void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
-#else
-static void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
-#endif
-{
-	neon_request_lsns request_lsns;
-	bits8		present;
-	void	   *bufferp;
-
-	switch (reln->smgr_relpersistence)
-	{
-		case 0:
-			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
-
-		case RELPERSISTENCE_PERMANENT:
-			break;
-
-		case RELPERSISTENCE_TEMP:
-		case RELPERSISTENCE_UNLOGGED:
-			mdread(reln, forkNum, blkno, buffer);
-			return;
-
-		default:
-			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
-	}
-
-	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
-
-	present = 0;
-	bufferp = buffer;
-	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
-	{
-		/* Prefetch hit */
-		return;
-	}
-
-	/* Try to read from local file cache */
-	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
-	{
-		MyNeonCounters->file_cache_hits_total++;
-		return;
-	}
-
-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
-
-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();
-
 #ifdef DEBUG_COMPARE_LOCAL
+static void
+compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn)
+{
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
 	{
 		char		pageserver_masked[BLCKSZ];
 		PGIOAlignedBlock mdbuf;
 		PGIOAlignedBlock mdbuf_masked;
-		XLogRecPtr  request_lsn = request_lsns.request_lsn;

+#if PG_MAJORVERSION_NUM >= 17
+		{
+			void* mdbuffers[1] = { mdbuf.data };
+			mdreadv(reln, forkNum, blkno, mdbuffers, 1);
+		}
+#else
 		mdread(reln, forkNum, blkno, mdbuf.data);
+#endif

 		memcpy(pageserver_masked, buffer, BLCKSZ);
 		memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ);
@@ -1413,11 +1363,121 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			}
 		}
 	}
+}
+#endif
+
+
+#if PG_MAJORVERSION_NUM < 17
+
+/*
+ *	neon_read() -- Read the specified block from a relation.
+ *
+ * TEMP and UNLOGGED relations, and the permanent relation recorded in
+ * unlogged_build_rel_info, are read with mdread(). Any other permanent
+ * relation is looked up in the prefetch results, then in the local file
+ * cache, and is finally requested with neon_read_at_lsn().
+ *
+ * When DEBUG_COMPARE_LOCAL is defined, a prefetch or file cache hit does not
+ * return early: the page is passed to compare_with_local() and execution
+ * continues with the next source, so each source is checked in turn.
+ */
+#if PG_MAJORVERSION_NUM < 16
+static void
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
+#else
+static void
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
+#endif
+{
+	neon_request_lsns request_lsns;
+	bits8		present;
+	void	   *bufferp;
+
+	switch (reln->smgr_relpersistence)
+	{
+		case 0:
+			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;
+
+		case RELPERSISTENCE_PERMANENT:
+			/* The relation of the unlogged build in progress is read with md. */
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdread(reln, forkNum, blkno, buffer);
+				return;
+			}
+			break;
+
+		case RELPERSISTENCE_TEMP:
+		case RELPERSISTENCE_UNLOGGED:
+			mdread(reln, forkNum, blkno, buffer);
+			return;
+
+		default:
+			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
+	}
+
+	/* Try to read PS results if they are available */
+	communicator_prefetch_pump_state();
+
+	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
+
+	present = 0;
+	bufferp = buffer;
+	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
+	{
+		/* Prefetch hit */
+#ifdef DEBUG_COMPARE_LOCAL
+		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+#else
+		return;
+#endif
+	}
+
+	/* Try to read from local file cache */
+	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
+	{
+		MyNeonCounters->file_cache_hits_total++;
+#ifdef DEBUG_COMPARE_LOCAL
+		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+#else
+		return;
+#endif
+	}
+
+	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+
+	/*
+	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+	 */
+	communicator_prefetch_pump_state();
+
+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
 #endif
 }
 #endif /* PG_MAJORVERSION_NUM <= 16 */

 #if PG_MAJORVERSION_NUM >= 17
+
+#ifdef DEBUG_COMPARE_LOCAL
+/* Vectored compare_with_local(): check every block whose bit is set in read_pages. */
+static void
+compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages)
+{
+	/* Only the main fork of relations matching IS_LOCAL_REL() is compared. */
+	if (forkNum != MAIN_FORKNUM || !(IS_LOCAL_REL(reln)))
+		return;
+	for (BlockNumber idx = 0; idx < nblocks; idx++)
+	{
+		if (!BITMAP_ISSET(read_pages, idx))
+			continue;
+		compare_with_local(reln, forkNum, blkno + idx, buffers[idx], request_lsns[idx].request_lsn);
+	}
+}
+#endif
+
+
 static void
 neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		   void **buffers, BlockNumber nblocks)
@@ -1431,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdreadv(reln, forknum, blocknum, buffers, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1460,8 +1516,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 													blocknum, request_lsns, nblocks,
 													buffers, read_pages);

+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	memset(read_pages, 0, sizeof(read_pages));
+#else
 	if (prefetch_result == nblocks)
 		return;
+#endif

 	/* Try to read from local file cache */
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
@@ -1470,9 +1531,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (lfc_result > 0)
 		MyNeonCounters->file_cache_hits_total += lfc_result;

+#ifdef DEBUG_COMPARE_LOCAL
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+	memset(read_pages, 0, sizeof(read_pages));
+#else
 	/* Read all blocks from LFC, so we're done */
 	if (prefetch_result + lfc_result == nblocks)
 		return;
+#endif

 	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
 							  buffers, nblocks, read_pages);
@@ -1483,91 +1549,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
-	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
-	{
-		char		pageserver_masked[BLCKSZ];
-		PGIOAlignedBlock mdbuf;
-		PGIOAlignedBlock mdbuf_masked;
-		XLogRecPtr  request_lsn = request_lsns->request_lsn;
-
-		for (int i = 0; i < nblocks; i++)
-		{
-			BlockNumber blkno = blocknum + i;
-			if (!BITMAP_ISSET(read_pages, i))
-				continue;
-
-#if PG_MAJORVERSION_NUM >= 17
-			{
-				void* mdbuffers[1] = { mdbuf.data };
-				mdreadv(reln, forknum, blkno, mdbuffers, 1);
-			}
-#else
-			mdread(reln, forknum, blkno, mdbuf.data);
-#endif
-
-			memcpy(pageserver_masked, buffers[i], BLCKSZ);
-			memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ);
-
-			if (PageIsNew((Page) mdbuf.data))
-			{
-				if (!PageIsNew((Page) pageserver_masked))
-				{
-					neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
-						 blkno,
-						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						 forknum,
-						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-						 hexdump_page(buffers[i]));
-				}
-			}
-			else if (PageIsNew((Page) buffers[i]))
-			{
-				neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
-					 blkno,
-					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-					 forknum,
-					 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-					 hexdump_page(mdbuf.data));
-			}
-			else if (PageGetSpecialSize(mdbuf.data) == 0)
-			{
-				/* assume heap */
-				RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno);
-				RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno);
-
-				if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0)
-				{
-					neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
-						 blkno,
-						 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-						 forknum,
-						 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-						 hexdump_page(mdbuf_masked.data),
-						 hexdump_page(pageserver_masked));
-				}
-			}
-			else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData)))
-			{
-				if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID)
-				{
-					/* assume btree */
-					RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno);
-					RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno);
-	
-					if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0)
-					{
-						neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
-							 blkno,
-							 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-							 forknum,
-							 (uint32) (request_lsn >> 32), (uint32) request_lsn,
-							 hexdump_page(mdbuf_masked.data),
-							 hexdump_page(pageserver_masked));
-					}
-				}
-			}
-		}
-	}
+	memset(read_pages, 0xFF, sizeof(read_pages));
+	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
 #endif
 }
 #endif
@@ -1638,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1647,9 +1639,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			#else
 			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 			#endif
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1710,14 +1699,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1753,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				return mdnblocks(reln, forknum);
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1822,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdtruncate(reln, forknum, old_blocks, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1960,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
-	Assert(unlogged_build_rel == NULL);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1977,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel = reln;
+			unlogged_build_rel_info = InfoFromSMgrRel(reln);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 #ifdef DEBUG_COMPARE_LOCAL
 			if (!IsParallelWorker())
@@ -1998,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
 #endif

-	unlogged_build_rel = reln;
+	unlogged_build_rel_info = InfoFromSMgrRel(reln);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;

-	/* Make the relation look like it's unlogged */
-	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
-
 	/*
 	 * Create the local file. In a parallel build, the leader is expected to
 	 * call this first and do it.
@@ -2030,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln)
 static void
 neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+					RelFileInfoFmt((unlogged_build_rel_info)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;

 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
-	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 	/*
 	 * In a parallel build, (only) the leader process performs the 2nd
@@ -2048,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		unlogged_build_rel = NULL;
+		NRelFileInfoInvalidate(unlogged_build_rel_info);
 		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 	}
 	else
@@ -2069,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);

-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
+					RelFileInfoFmt(unlogged_build_rel_info))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2081,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 		BlockNumber nblocks;

 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
-		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 		/*
 		 * Update the last-written LSN cache.
@@ -2102,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 								InfoFromNInfoB(rinfob),
 								MAIN_FORKNUM);

-		/* Make the relation look permanent again */
-		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
-
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
@@ -2113,6 +2104,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 forknum);

 			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+
 			mdclose(reln, forknum);
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
@@ -2123,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 	}
-	unlogged_build_rel = NULL;
+	NRelFileInfoInvalidate(unlogged_build_rel_info);
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }

@@ -2196,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			unlogged_build_rel = NULL;
+			NRelFileInfoInvalidate(unlogged_build_rel_info);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 			break;

@@ -2208,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				unlogged_build_rel = NULL;
+				NRelFileInfoInvalidate(unlogged_build_rel_info);
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"]

 [[package]]
 name = "flask-cors"
-version = "5.0.0"
-description = "A Flask extension adding a decorator for CORS support"
+version = "6.0.0"
+description = "A Flask extension simplifying CORS support"
 optional = false
-python-versions = "*"
+python-versions = "<4.0,>=3.9"
 groups = ["main"]
 files = [
-    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
-    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
+    {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
+    {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
 ]

 [package.dependencies]
-Flask = ">=0.9"
+flask = ">=0.9"
+Werkzeug = ">=0.7"

 [[package]]
 name = "frozenlist"
@@ -3169,19 +3170,24 @@ pbr = "*"

 [[package]]
 name = "setuptools"
-version = "70.0.0"
+version = "78.1.1"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
-    {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
+    {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
+    {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
 ]

 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
+core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
+type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]

 [[package]]
 name = "six"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -127,3 +127,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
+tracing-test = "0.2"
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -80,10 +80,22 @@ impl std::fmt::Display for Backend<'_, ()> {
                    .field(&endpoint.url())
                    .finish(),
                #[cfg(any(test, feature = "testing"))]
-                ControlPlaneClient::PostgresMock(endpoint) => fmt
-                    .debug_tuple("ControlPlane::PostgresMock")
-                    .field(&endpoint.url())
-                    .finish(),
+                ControlPlaneClient::PostgresMock(endpoint) => {
+                    let url = endpoint.url();
+                    match url::Url::parse(url) {
+                        Ok(mut url) => {
+                            let _ = url.set_password(Some("_redacted_"));
+                            let url = url.as_str();
+                            fmt.debug_tuple("ControlPlane::PostgresMock")
+                                .field(&url)
+                                .finish()
+                        }
+                        Err(_) => fmt
+                            .debug_tuple("ControlPlane::PostgresMock")
+                            .field(&url)
+                            .finish(),
+                    }
+                }
                #[cfg(test)]
                ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
            },
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,6 +1,14 @@
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

+// NUL-terminated option string exported under the symbol name `malloc_conf`.
+// NOTE(review): presumably this is the variable jemalloc reads at startup, and
+// the options look like heap profiling enabled and active with a sample taken
+// every 2^21 bytes on average — confirm against the jemalloc manual.
+#[allow(non_upper_case_globals)]
+#[unsafe(export_name = "malloc_conf")]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
+
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    proxy::binary::proxy::run().await
--- a/Show More
+++ b/Show More