remove LayerKey usage

Signed-off-by: Alex Chi <chi@neon.tech>
fix tests
2026-01-31 09:10:38 +00:00 · 2023-06-28 10:00:02 -04:00 · 2023-06-28 09:58:31 -04:00 · 2023-06-27 16:57:11 -04:00 · 2023-06-27 16:55:56 -04:00 · 2023-06-27 16:54:51 -04:00
35 changed files with 591 additions and 849 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -722,35 +722,6 @@ jobs:
                           --dockerfile Dockerfile.compute-node
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
-                           --cleanup
-
-      # Due to a kaniko bug, we can't use cache for extensions image, thus it takes about the same amount of time as compute-node image to build (~10 min)
-      # During the transition period we need to have extensions in both places (in S3 and in compute-node image),
-      # so we won't build extension twice, but extract them from compute-node.
-      #
-      # For now we use extensions image only for new custom extensitons
-      - name: Kaniko build extensions only
-        run: |
-          # Kaniko is suposed to clean up after itself if --cleanup flag is set, but it doesn't.
-          # Despite some fixes were made in https://github.com/GoogleContainerTools/kaniko/pull/2504 (in kaniko v1.11.0),
-          # it still fails with error:
-          #   error building image: could not save file: copying file: symlink postgres /kaniko/1/usr/local/pgsql/bin/postmaster: file exists
-          #
-          # Ref https://github.com/GoogleContainerTools/kaniko/issues/1406
-          find /kaniko -maxdepth 1 -mindepth 1 -type d -regex "/kaniko/[0-9]*" -exec rm -rv {} \;
-
-          /kaniko/executor --reproducible --snapshot-mode=redo --skip-unused-stages --cache=true \
-                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache \
-                           --context . \
-                           --build-arg GIT_VERSION=${{ github.sha }} \
-                           --build-arg PG_VERSION=${{ matrix.version }} \
-                           --build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}} \
-                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com \
-                           --dockerfile Dockerfile.compute-node \
-                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --destination neondatabase/extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
-                           --cleanup \
-                           --target postgres-extensions

      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied
      - name: Cleanup ECR folder
@@ -869,10 +840,8 @@ jobs:
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Push images to production ECR
        if: |
@@ -883,10 +852,8 @@ jobs:
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v14:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:latest
-          crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/extensions-v15:latest

      - name: Configure Docker Hub login
        run: |
@@ -908,89 +875,16 @@ jobs:
          crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v14:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
          crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest
-          crane tag neondatabase/extensions-v15:${{needs.tag.outputs.build-tag}} latest

      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

-  upload-postgres-extensions-to-s3:
-    if: |
-      (github.ref_name == 'main' || github.ref_name == 'release') &&
-       github.event_name != 'workflow_dispatch'
-    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
-    needs: [ tag, promote-images ]
-    strategy:
-      fail-fast: false
-      matrix:
-        version: [ v14, v15 ]
-
-    env:
-      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
-      # Later all the extensions will be moved to extensions image.
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
-      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
-      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
-      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
-      S3_BUCKETS: |
-        ${{ github.ref_name == 'release' &&
-          'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
-          'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
-
-    steps:
-      - name: Pull postgres-extensions image
-        run: |
-          docker pull ${EXTENSIONS_IMAGE}
-          docker pull ${COMPUTE_NODE_IMAGE}
-
-      - name: Create postgres-extensions container
-        id: create-container
-        run: |
-          EID=$(docker create ${EXTENSIONS_IMAGE} true)
-          echo "EID=${EID}" >> $GITHUB_OUTPUT
-
-          CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
-          echo "CID=${CID}" >> $GITHUB_OUTPUT
-
-      - name: Extract postgres-extensions from container
-        run: |
-          rm -rf ./extensions-to-upload ./custom-extensions # Just in case
-
-          # In compute image we have a bit different directory layout
-          mkdir -p extensions-to-upload/share
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
-          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib
-
-          # Delete Neon extensitons (they always present on compute-node image)
-          rm -rf ./extensions-to-upload/share/extension/neon*
-          rm -rf ./extensions-to-upload/lib/neon*
-
-          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
-          for EXT_NAME in $(ls ./custom-extensions); do
-            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
-
-            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
-            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
-          done
-
-      - name: Upload postgres-extensions to S3
-        run: |
-          for BUCKET in $(echo ${S3_BUCKETS}); do
-            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-          done
-
-      - name: Cleanup
-        if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
-        run: |
-          docker rm ${{ steps.create-container.outputs.CID }} || true
-          docker rm ${{ steps.create-container.outputs.EID }} || true
-
  deploy:
    runs-on: [ self-hosted, gen3, small ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-    needs: [ upload-postgres-extensions-to-s3, promote-images, tag, regress-tests ]
+    needs: [ promote-images, tag, regress-tests ]
    if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
    steps:
      - name: Fix git ownership
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4223,7 +4223,8 @@ dependencies = [
 [[package]]
 name = "tokio"
 version = "1.28.1"
-source = "git+https://github.com/problame/tokio.git?branch=problame/distinguish-core-and-worker-by-thread-name#d88791686cfc7fc7d010889ad7638d09646b3de7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
 dependencies = [
 "autocfg",
 "bytes",
@@ -4250,7 +4251,8 @@ dependencies = [
 [[package]]
 name = "tokio-macros"
 version = "2.1.0"
-source = "git+https://github.com/problame/tokio.git?branch=problame/distinguish-core-and-worker-by-thread-name#d88791686cfc7fc7d010889ad7638d09646b3de7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
 "proc-macro2",
 "quote",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -187,8 +187,6 @@ tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", re
 # until async safekeepers patch is merged to the main.
 sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }

-tokio = { git = "https://github.com/problame/tokio.git", branch="problame/distinguish-core-and-worker-by-thread-name" }
-
 ################# Binary contents sections

 [profile.release]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -515,26 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

-#########################################################################################
-#
-# Layer "pg-anon-pg-build"
-# compile anon extension
-#
-#########################################################################################
-FROM build-deps AS pg-anon-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
-    echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sort  > /before.txt && \
-    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
-    find /usr/local/pgsql -type f | sort  > /after.txt && \
-    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
-
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -643,7 +623,6 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
 #
 #########################################################################################
 FROM build-deps AS neon-pg-ext-build
-# Public extensions
 COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=postgis-build /sfcgal/* /
 COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -719,22 +698,6 @@ RUN rm -r /usr/local/pgsql/include
 # if they were to be used by other libraries.
 RUN rm /usr/local/pgsql/lib/lib*.a

-#########################################################################################
-#
-# Extenstion only
-#
-#########################################################################################
-FROM scratch AS postgres-extensions
-# After the transition this layer will include all extensitons.
-# As for now, it's only for new custom ones
-#
-# # Default extensions
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
-# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
-# Custom extensions
-COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
-COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
-
 #########################################################################################
 #
 # Final layer
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -235,7 +235,7 @@ impl ComputeNode {

    // Get basebackup from the libpq connection to pageserver using `connstr` and
    // unarchive it to `pgdata` directory overriding all its previous content.
-    #[instrument(skip_all, fields(%lsn))]
+    #[instrument(skip(self, compute_state))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Utc::now();
@@ -277,7 +277,7 @@ impl ComputeNode {

    // Run `postgres` in a special mode with `--sync-safekeepers` argument
    // and return the reported LSN back to the caller.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, storage_auth_token))]
    fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

@@ -322,7 +322,7 @@ impl ComputeNode {

    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, compute_state))]
    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
@@ -380,7 +380,7 @@ impl ComputeNode {

    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
@@ -404,7 +404,7 @@ impl ComputeNode {
    }

    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, compute_state))]
    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
        // If connection fails,
        // it may be the old node with `zenith_admin` superuser.
@@ -458,7 +458,7 @@ impl ComputeNode {
    // We could've wrapped this around `pg_ctl reload`, but right now we don't use
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
-    #[instrument(skip_all)]
+    #[instrument(skip(self, client))]
    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
@@ -466,7 +466,7 @@ impl ComputeNode {

    /// Similar to `apply_config()`, but does a bit different sequence of operations,
    /// as it's used to reconfigure a previously started and configured Postgres node.
-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

@@ -501,7 +501,7 @@ impl ComputeNode {
        Ok(())
    }

-    #[instrument(skip_all)]
+    #[instrument(skip(self))]
    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
--- a/compute_tools/src/configurator.rs
+++ b/compute_tools/src/configurator.rs
@@ -8,7 +8,7 @@ use compute_api::responses::ComputeStatus;

 use crate::compute::ComputeNode;

-#[instrument(skip_all)]
+#[instrument(skip(compute))]
 fn configurator_main_loop(compute: &Arc<ComputeNode>) {
    info!("waiting for reconfiguration requests");
    loop {
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -215,7 +215,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
 /// Wait for Postgres to become ready to accept connections. It's ready to
 /// accept connections when the state-field in `pgdata/postmaster.pid` says
 /// 'ready'.
-#[instrument(skip_all, fields(pgdata = %pgdata.display()))]
+#[instrument(skip(pg))]
 pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
    let pid_path = pgdata.join("postmaster.pid");

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -308,8 +308,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_flag("force");
-    env.init(pg_version, force)
+    env.init(pg_version)
        .context("Failed to initialize neon repository")?;

    // Initialize pageserver, create initial tenant and timeline.
@@ -1014,13 +1013,6 @@ fn cli() -> Command {
        .help("If set, the node will be a hot replica on the specified timeline")
        .required(false);

-    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(bool))
-        .long("force")
-        .action(ArgAction::SetTrue)
-        .help("Force initialization even if the repository is not empty")
-        .required(false);
-
    Command::new("Neon CLI")
        .arg_required_else_help(true)
        .version(GIT_VERSION)
@@ -1036,7 +1028,6 @@ fn cli() -> Command {
                        .value_name("config"),
                )
                .arg(pg_version_arg.clone())
-                .arg(force_arg)
        )
        .subcommand(
            Command::new("timeline")
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -364,7 +364,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -372,29 +372,11 @@ impl LocalEnv {
            "repository base path is missing"
        );

-        if base_path.exists() {
-            if force {
-                println!("removing all contents of '{}'", base_path.display());
-                // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                // S3 local SSD) to the `.neon` base directory.
-                for entry in std::fs::read_dir(base_path)? {
-                    let entry = entry?;
-                    let path = entry.path();
-                    if path.is_dir() {
-                        fs::remove_dir_all(&path)?;
-                    } else {
-                        fs::remove_file(&path)?;
-                    }
-                }
-            } else {
-                bail!(
-                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
-                    base_path.display()
-                );
-            }
-        }
-
+        ensure!(
+            !base_path.exists(),
+            "directory '{}' already exists. Perhaps already initialized?",
+            base_path.display()
+        );
        if !self.pg_bin_dir(pg_version)?.join("postgres").exists() {
            bail!(
                "Can't find postgres binary at {}",
@@ -410,9 +392,7 @@ impl LocalEnv {
            }
        }

-        if !base_path.exists() {
-            fs::create_dir(base_path)?;
-        }
+        fs::create_dir(base_path)?;

        // Generate keypair for JWT.
        //
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -23,6 +23,7 @@ use super::models::{
    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::disk_usage_eviction_task;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
@@ -34,7 +35,6 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
-use crate::{disk_usage_eviction_task, tenant};
 use utils::{
    auth::JwtAuth,
    http::{
@@ -328,17 +328,15 @@ async fn timeline_create_handler(
            &ctx,
        )
        .await {
-            Ok(new_timeline) => {
+            Ok(Some(new_timeline)) => {
                // Created. Construct a TimelineInfo for it.
                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
-            Err(tenant::CreateTimelineError::AlreadyExists) => {
-                json_response(StatusCode::CONFLICT, ())
-            }
-            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
        }
    }
    .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -130,66 +130,6 @@ pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_accesses_total",
-        "Number of read accesses to the page cache",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE_READ_ACCESSES_MATERIALIZED_PAGE: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_ACCESSES
-        .get_metric_with_label_values(&["materialized_page"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_ACCESSES_EPHEMERAL: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_ACCESSES
-        .get_metric_with_label_values(&["ephemeral"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_ACCESSES_IMMUTABLE: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_ACCESSES
-        .get_metric_with_label_values(&["immutable"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_page_cache_read_hits_total",
-        "Number of read accesses to the page cache that hit",
-        &["key_kind", "hit_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE_READ_HITS_EPHEMERAL: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_HITS
-        .get_metric_with_label_values(&["ephemeral", "-"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_HITS_IMMUTABLE: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_HITS
-        .get_metric_with_label_values(&["immutable", "-"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_EXACT: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_HITS
-        .get_metric_with_label_values(&["materialized_page", "exact"])
-        .unwrap()
-});
-
-pub static PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_OLDER_LSN: Lazy<IntCounter> = Lazy::new(|| {
-    PAGE_CACHE_READ_HITS
-        .get_metric_with_label_values(&["materialized_page", "older_lsn"])
-        .unwrap()
-});
-
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
@@ -674,79 +614,6 @@ pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
 pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
    Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));

-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_STARTED_COUNT: Lazy<IntCounter> =
-    Lazy::new(|| {
-        register_int_counter!(
-            "pageserver_layer_get_value_reconstruct_data_spawn_blocking_started_count",
-            "Number of spawn_blocking calls made in Layer::get_value_reconstruct_data"
-        )
-        .expect("failed to define a metric")
-    });
-
-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE: Lazy<IntGauge> =
-    Lazy::new(|| {
-        register_int_gauge!(
-            "pageserver_layer_get_value_reconstruct_data_spawn_blocking_active_gauge",
-            "Number of spawn_blocking calls active in Layer::get_value_reconstruct_data"
-        )
-        .expect("failed to define a metric")
-    });
-
-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_QUEUE_DELAY: Lazy<Histogram> = Lazy::new(
-    || {
-        register_histogram!(
-            "pageserver_layer_get_value_reconstruct_data_spawn_blocking_queue_delay_seconds",
-            "Time a Layer::get_value_reconstruct_data call spends in spawn_blocking queue until the first line of blockign code runs inside spawn_blocking",
-            vec![
-            0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
-            0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000, 0.500_000,
-            1.000_000, 2.000_000, 5.000_000, 10.000_000, 25.000_000, 50.000_000, 100.000_000,
-            ],
-        )
-        .expect("failed to define a metric")
-    },
-);
-
-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "pageserver_layer_get_value_reconstruct_data_completion_time_seconds",
-        "Time a Layer::get_value_reconstruct_data call takes to complete",
-        &["result"],
-        vec![
-            0.000_005,
-            0.000_010,
-            0.000_025,
-            0.000_050,
-            0.000_100,
-            0.000_250,
-            0.000_500,
-            0.001_000,
-            0.002_500,
-            0.005_000,
-            0.010_000,
-            0.025_000,
-            0.050_000,
-            0.100_000,
-            0.250_000,
-            0.500_000,
-            1.000_000,
-            2.000_000,
-            5.000_000,
-            10.000_000,
-            25.000_000,
-            50.000_000,
-            100.000_000,
-        ]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_OK: Lazy<Histogram> =
-    Lazy::new(|| LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME.with_label_values(&["ok"]));
-
-pub static LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_ERROR: Lazy<Histogram> =
-    Lazy::new(|| LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME.with_label_values(&["error"]));
-
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -313,8 +313,6 @@ impl PageCache {
        key: &Key,
        lsn: Lsn,
    ) -> Option<(Lsn, PageReadGuard)> {
-        crate::metrics::PAGE_CACHE_READ_ACCESSES_MATERIALIZED_PAGE.inc();
-
        let mut cache_key = CacheKey::MaterializedPage {
            hash_key: MaterializedPageHashKey {
                tenant_id,
@@ -325,17 +323,8 @@ impl PageCache {
        };

        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
-            if let CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: available_lsn,
-            } = cache_key
-            {
-                if available_lsn == lsn {
-                    crate::metrics::PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_EXACT.inc();
-                } else {
-                    crate::metrics::PAGE_CACHE_READ_HITS_MATERIALIZED_PAGE_OLDER_LSN.inc();
-                }
-                Some((available_lsn, guard))
+            if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
+                Some((lsn, guard))
            } else {
                panic!("unexpected key type in slot");
            }
@@ -510,31 +499,11 @@ impl PageCache {
    /// ```
    ///
    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
-        let (read_access, hit) = match cache_key {
-            CacheKey::MaterializedPage { .. } => {
-                unreachable!("Materialized pages use lookup_materialized_page")
-            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE_READ_ACCESSES_EPHEMERAL,
-                &crate::metrics::PAGE_CACHE_READ_HITS_EPHEMERAL,
-            ),
-            CacheKey::ImmutableFilePage { .. } => (
-                &crate::metrics::PAGE_CACHE_READ_ACCESSES_IMMUTABLE,
-                &crate::metrics::PAGE_CACHE_READ_HITS_IMMUTABLE,
-            ),
-        };
-        read_access.inc();
-
-        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
-                if is_first_iteration {
-                    hit.inc();
-                }
                return Ok(ReadBufResult::Found(read_guard));
            }
-            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) =
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -904,7 +904,7 @@ where

            self.check_permission(Some(tenant_id))?;

-            let lsn = if params.len() >= 3 {
+            let lsn = if params.len() == 3 {
                Some(
                    Lsn::from_str(params[2])
                        .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -887,7 +887,7 @@ impl<'a> DatadirModification<'a> {
        ctx: &RequestContext,
    ) -> Result<(), RelationError> {
        if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            return Err(RelationError::AlreadyExists);
        }
        // It's possible that this is the first rel for this db in this
        // tablespace.  Create the reldir entry for it if so.
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -102,33 +102,11 @@ use crate::shutdown_pageserver;
 // It's also good to avoid hogging all threads that would be needed to process
 // other operations, if the upload tasks e.g. get blocked on locks. It shouldn't
 // happen, but still.
-
-static PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE: Lazy<Option<usize>> = Lazy::new(|| {
-    let env_var: String = match std::env::var("PAGESERVER_TOKIO_MAX_BLOCKING_THREADS") {
-        Ok(v) => v,
-        Err(std::env::VarError::NotPresent) => {
-            debug!("env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS not set, using default");
-            return None;
-        }
-        Err(std::env::VarError::NotUnicode(_)) => {
-            panic!("env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS is not valid UTF-8");
-        }
-    };
-    let pool_size = match env_var.parse() {
-        Ok(v) => v,
-        Err(e) => {
-            panic!("Failed to parse PAGESERVER_TOKIO_MAX_BLOCKING_THREADS: {e:?}");
-        }
-    };
-    eprintln!("using spawn_blocking pool size override from env var PAGESERVER_TOKIO_MAX_BLOCKING_THREADS: {pool_size:?}");
-    Some(pool_size)
-});
-
+//
 pub static COMPUTE_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("compute request worker")
        .enable_all()
-        .max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
        .build()
        .expect("Failed to create compute request runtime")
 });
@@ -137,7 +115,6 @@ pub static MGMT_REQUEST_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("mgmt request worker")
        .enable_all()
-        .max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
        .build()
        .expect("Failed to create mgmt request runtime")
 });
@@ -146,7 +123,6 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("walreceiver worker")
        .enable_all()
-        .max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
        .build()
        .expect("Failed to create walreceiver runtime")
 });
@@ -155,7 +131,6 @@ pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
    tokio::runtime::Builder::new_multi_thread()
        .thread_name("background op worker")
        .enable_all()
-        .max_blocking_threads((*PAGESERVER_TOKIO_MAX_BLOCKING_THREADS_OVERRIDE).unwrap_or(512))
        .build()
        .expect("Failed to create background op runtime")
 });
@@ -531,17 +506,17 @@ pub async fn shutdown_tasks(
                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                }
            }
-            let join_handle = tokio::select! {
+            let completed = tokio::select! {
                biased;
-                _ = &mut join_handle => { None },
+                _ = &mut join_handle => { true },
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // allow some time to elapse before logging to cut down the number of log
                    // lines.
                    info!("waiting for {} to shut down", task.name);
-                    Some(join_handle)
+                    false
                }
            };
-            if let Some(join_handle) = join_handle {
+            if !completed {
                // we never handled this return value, but:
                // - we don't deschedule which would lead to is_cancelled
                // - panics are already logged (is_panicked)
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -85,6 +85,7 @@ pub mod blob_io;
 pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
+pub mod layer_cache;
 pub mod layer_map;
 pub mod manifest;

@@ -501,14 +502,6 @@ impl DeletionGuard {
    }
 }

-#[derive(thiserror::Error, Debug)]
-pub enum CreateTimelineError {
-    #[error("a timeline with the given ID already exists")]
-    AlreadyExists,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
 impl Tenant {
    /// Yet another helper for timeline initialization.
    /// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -1383,7 +1376,8 @@ impl Tenant {
    /// Returns the new timeline ID and reference to its Timeline object.
    ///
    /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
-    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
+    /// the same timeline ID already exists, returns `Ok(None)`. Otherwise the newly created
+    /// timeline is returned as `Ok(Some(..))`.
    pub async fn create_timeline(
        &self,
        new_timeline_id: TimelineId,
@@ -1392,12 +1386,11 @@ impl Tenant {
        pg_version: u32,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
-    ) -> Result<Arc<Timeline>, CreateTimelineError> {
-        if !self.is_active() {
-            return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "Cannot create timelines on inactive tenant"
-            )));
-        }
+    ) -> anyhow::Result<Option<Arc<Timeline>>> {
+        anyhow::ensure!(
+            self.is_active(),
+            "Cannot create timelines on inactive tenant"
+        );

        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");
@@ -1417,7 +1410,7 @@ impl Tenant {
                    .context("wait for timeline uploads to complete")?;
            }

-            return Err(CreateTimelineError::AlreadyExists);
+            return Ok(None);
        }

        let loaded_timeline = match ancestor_timeline_id {
@@ -1432,12 +1425,12 @@ impl Tenant {
                    let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
                    if ancestor_ancestor_lsn > *lsn {
                        // can we safely just branch from the ancestor instead?
-                        return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                        bail!(
                            "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
                            lsn,
                            ancestor_timeline_id,
                            ancestor_ancestor_lsn,
-                        )));
+                        );
                    }

                    // Wait for the WAL to arrive and be processed on the parent branch up
@@ -1471,7 +1464,7 @@ impl Tenant {
            })?;
        }

-        Ok(loaded_timeline)
+        Ok(Some(loaded_timeline))
    }

    /// perform one garbage collection iteration, removing old data files from disk.
@@ -1625,7 +1618,7 @@ impl Tenant {
            // No timeout here, GC & Compaction should be responsive to the
            // `TimelineState::Stopping` change.
            info!("waiting for layer_removal_cs.lock()");
-            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            let layer_removal_guard = timeline.layer_cache.delete_guard().await;
            info!("got layer_removal_cs.lock(), deleting layer files");

            // NB: storage_sync upload tasks that reference these layers have been cancelled
--- a/pageserver/src/tenant/layer_cache.rs
+++ b/pageserver/src/tenant/layer_cache.rs
@@ -0,0 +1,146 @@
+use super::storage_layer::{PersistentLayer, PersistentLayerDesc, PersistentLayerKey, RemoteLayer};
+use super::Timeline;
+use crate::tenant::layer_map::LayerMap;
+use crate::tenant::timeline::compare_arced_layers;
+use anyhow::Result;
+use std::sync::{Mutex, Weak};
+use std::{collections::HashMap, sync::Arc};
+
+/// `LayerCache` facilitates mapping from a `PersistentLayerDesc` to the actual in-memory layer
+/// object. In the future, operations that do not modify layer map (i.e., eviction and download) will be implemented
+/// here.
+pub struct LayerCache {
+    /// Layer removal lock.
+    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
+    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
+    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
+    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
+    pub layers_removal_lock: Arc<tokio::sync::Mutex<()>>,
+
+    /// Needed because there is currently no way to prevent GC/compaction from removing files that are in use.
+    /// Once in-use layers are protected by reference counting on `Arc`, this lock can be safely removed.
+    pub layers_operation_lock: Arc<tokio::sync::RwLock<()>>,
+
+    /// Will be useful when we move evict / download to layer cache.
+    #[allow(unused)]
+    timeline: Weak<Timeline>,
+
+    mapping: Mutex<HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>>,
+}
+
+pub struct LayerInUseWrite(tokio::sync::OwnedRwLockWriteGuard<()>);
+
+pub struct LayerInUseRead(tokio::sync::OwnedRwLockReadGuard<()>);
+
+#[derive(Clone)]
+pub struct LayerDeletionGuard(Arc<tokio::sync::OwnedMutexGuard<()>>);
+
+impl LayerCache {
+    /// Creates an empty cache with fresh locks; `timeline` is stored but not used yet.
+    pub fn new(timeline: Weak<Timeline>) -> Self {
+        Self {
+            layers_operation_lock: Arc::new(tokio::sync::RwLock::new(())),
+            layers_removal_lock: Arc::new(tokio::sync::Mutex::new(())),
+            mapping: Mutex::new(HashMap::new()),
+            timeline,
+        }
+    }
+
+    /// Returns the layer registered under `desc.key()`. Panics if no such layer is in the mapping.
+    pub fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
+        let guard = self.mapping.lock().unwrap();
+        guard.get(&desc.key()).expect("not found").clone()
+    }
+
+    /// Mimics the exclusive side of the original `layers` lock in `Timeline`. Can be removed once we ensure
+    /// we won't delete files that are being read.
+    pub async fn layer_in_use_write(&self) -> LayerInUseWrite {
+        LayerInUseWrite(self.layers_operation_lock.clone().write_owned().await)
+    }
+
+    /// Mimics the shared side of the original `layers` lock in `Timeline`. Can be removed once we ensure
+    /// we won't delete files that are being read.
+    pub async fn layer_in_use_read(&self) -> LayerInUseRead {
+        LayerInUseRead(self.layers_operation_lock.clone().read_owned().await)
+    }
+
+    /// Ensures only one of compaction / gc can happen at a time.
+    pub async fn delete_guard(&self) -> LayerDeletionGuard {
+        LayerDeletionGuard(Arc::new(
+            self.layers_removal_lock.clone().lock_owned().await,
+        ))
+    }
+
+    /// Should only be called when initializing the timeline. Bypasses checks and the layer operation lock.
+    pub fn remove_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
+        let mut guard = self.mapping.lock().unwrap();
+        guard.remove(&layer.layer_desc().key());
+    }
+
+    /// Should only be called when initializing the timeline. Bypasses checks and the layer operation lock.
+    pub fn populate_remote_when_init(&self, layer: Arc<RemoteLayer>) {
+        let mut guard = self.mapping.lock().unwrap();
+        guard.insert(layer.layer_desc().key(), layer);
+    }
+
+    /// Should only be called when initializing the timeline. Bypasses checks and the layer operation lock.
+    pub fn populate_local_when_init(&self, layer: Arc<dyn PersistentLayer>) {
+        let mut guard = self.mapping.lock().unwrap();
+        guard.insert(layer.layer_desc().key(), layer);
+    }
+
+    /// Called within read path. Swaps the mapping entry `expected` for `new` under the mapping mutex.
+    /// Fails if the two layers differ in key or L0-ness, or if the entry is absent or does not match `expected`.
+    pub fn replace_and_verify(
+        &self,
+        expected: Arc<dyn PersistentLayer>,
+        new: Arc<dyn PersistentLayer>,
+    ) -> Result<()> {
+        let mut guard = self.mapping.lock().unwrap();
+        let key = expected.layer_desc().key();
+        let other = new.layer_desc().key();
+        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
+        let new_l0 = LayerMap::is_l0(new.layer_desc());
+
+        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
+            "layermap-replace-notfound"
+        ));
+
+        anyhow::ensure!(
+            key == other,
+            "replacing downloaded layer into layermap failed because two layers have different keys: {key:?} != {other:?}"
+        );
+        anyhow::ensure!(
+            expected_l0 == new_l0,
+            "replacing downloaded layer into layermap failed because one layer is l0 while the other is not: {expected_l0} != {new_l0}"
+        );
+
+        if let Some(layer) = guard.get_mut(&key) {
+            anyhow::ensure!(
+                compare_arced_layers(&expected, layer),
+                "replacing downloaded layer into layermap failed because another layer was found instead of expected, expected={expected:?}, found={found:?}",
+                expected = Arc::as_ptr(&expected),
+                found = Arc::as_ptr(layer),
+            );
+            *layer = new;
+            Ok(())
+        } else {
+            anyhow::bail!(
+                "replacing downloaded layer into layermap failed because layer was not found"
+            );
+        }
+    }
+
+    /// Called within write path. Compaction and image layer creation produce new layers, which are registered here.
+    pub fn create_new_layer(&self, layer: Arc<dyn PersistentLayer>) {
+        let mut guard = self.mapping.lock().unwrap();
+        guard.insert(layer.layer_desc().key(), layer);
+    }
+
+    /// Called within write path. GC and compaction remove layers and delete them on disk; this drops the mapping entry.
+    /// The logic to delete the files will be moved here later.
+    pub fn delete_layer(&self, layer: Arc<dyn PersistentLayer>) {
+        let mut guard = self.mapping.lock().unwrap();
+        guard.remove(&layer.layer_desc().key());
+    }
+}
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -58,7 +58,7 @@ use std::sync::Arc;
 use utils::lsn::Lsn;

 use historic_layer_coverage::BufferedHistoricLayerCoverage;
-pub use historic_layer_coverage::LayerKey;
+pub use historic_layer_coverage::{LayerKey, Replacement};

 use super::storage_layer::range_eq;
 use super::storage_layer::PersistentLayerDesc;
@@ -658,10 +658,7 @@ mod tests {

    mod l0_delta_layers_updated {

-        use crate::tenant::{
-            storage_layer::{PersistentLayer, PersistentLayerDesc},
-            timeline::LayerFileManager,
-        };
+        use crate::tenant::storage_layer::{PersistentLayer, PersistentLayerDesc};

        use super::*;

@@ -694,31 +691,6 @@ mod tests {
             )
        }

-        #[test]
-        fn replacing_missing_l0_is_notfound() {
-            // original impl had an oversight, and L0 was an anyhow::Error. anyhow::Error should
-            // however only happen for precondition failures.
-
-            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
-            let layer = LayerFileName::from_str(layer).unwrap();
-            let layer = LayerDescriptor::from(layer);
-
-            // same skeletan construction; see scenario below
-            let not_found = Arc::new(layer.clone());
-            let new_version = Arc::new(layer);
-
-            // after the immutable storage state refactor, the replace operation
-            // will not use layer map any more. We keep it here for consistency in test cases
-            // and can remove it in the future.
-            let _map = LayerMap::default();
-
-            let mut mapping = LayerFileManager::new();
-
-            mapping
-                .replace_and_verify(not_found, new_version)
-                .unwrap_err();
-        }
-
        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = LayerDescriptor::from(name);
@@ -727,7 +699,6 @@ mod tests {
            let downloaded = Arc::new(skeleton);

            let mut map = LayerMap::default();
-            let mut mapping = LayerFileManager::new();

            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
@@ -737,20 +708,11 @@ mod tests {

            map.batch_update()
                .insert_historic(remote.layer_desc().clone());
-            mapping.insert(remote.clone());
            assert_eq!(
                count_layer_in(&map, remote.layer_desc()),
                expected_in_counts
            );

-            mapping
-                .replace_and_verify(remote, downloaded.clone())
-                .expect("name derived attributes are the same");
-            assert_eq!(
-                count_layer_in(&map, downloaded.layer_desc()),
-                expected_in_counts
-            );
-
            map.batch_update()
                .remove_historic(downloaded.layer_desc().clone());
            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -43,6 +43,18 @@ impl Ord for LayerKey {
    }
 }

+impl<'a, L: crate::tenant::storage_layer::Layer + ?Sized> From<&'a L> for LayerKey {
+    fn from(layer: &'a L) -> Self {
+        let kr = layer.get_key_range();
+        let lr = layer.get_lsn_range();
+        LayerKey {
+            key: kr.start.to_i128()..kr.end.to_i128(),
+            lsn: lr.start.0..lr.end.0,
+            is_image: !layer.is_incremental(),
+        }
+    }
+}
+
 impl From<&PersistentLayerDesc> for LayerKey {
    fn from(layer: &PersistentLayerDesc) -> Self {
        let kr = layer.get_key_range();
@@ -456,6 +468,64 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
        self.buffer.insert(layer_key, None);
    }

+    /// Replaces a previous layer with a new layer value.
+    ///
+    /// The replacement is conditional on:
+    /// - there is an existing `LayerKey` record
+    /// - there is no buffered removal for the given `LayerKey`
+    /// - the given closure returns true for the current `Value`
+    ///
+    /// The closure is used to compare the latest value (buffered insert, or existing layer)
+    /// against some expectation. This allows using `Arc::ptr_eq` or similar, which would be
+    /// inaccessible via the `PartialEq` trait.
+    ///
+    /// Returns a `Replacement` value describing the outcome; only the case of
+    /// `Replacement::Replaced` modifies the map and requires a rebuild.
+    ///
+    /// This function is unlikely to be used in the future because LayerMap now only records the
+    /// layer descriptors. Therefore, anything added to the layer map will only be removed or
+    /// added, and never replaced.
+    #[cfg(test)]
+    pub fn replace<F>(
+        &mut self,
+        layer_key: &LayerKey,
+        new: Value,
+        check_expected: F,
+    ) -> Replacement<Value>
+    where
+        F: FnOnce(&Value) -> bool,
+    {
+        let (slot, in_buffered) = match self.buffer.get(layer_key) {
+            Some(inner @ Some(_)) => {
+                // compare against the buffered insert, because a rebuild will apply it
+                // before the map is queried again
+                (inner.as_ref(), true)
+            }
+            Some(None) => {
+                // the buffer holds a removal for this key; no check_expected could match it.
+                return Replacement::RemovalBuffered;
+            }
+            None => {
+                // no pending modification for the key, check layers
+                (self.layers.get(layer_key), false)
+            }
+        };
+
+        match slot {
+            Some(existing) if !check_expected(existing) => {
+                // unfortunate clone here, but otherwise the nll borrowck grows the region of
+                // 'a to cover the whole function, and we could not mutate in the other
+                // Some(existing) branch
+                Replacement::Unexpected(existing.clone())
+            }
+            None => Replacement::NotFound,
+            Some(_existing) => {
+                self.insert(layer_key.to_owned(), new);
+                Replacement::Replaced { in_buffered }
+            }
+        }
+    }
+
    pub fn rebuild(&mut self) {
        // Find the first LSN that needs to be rebuilt
        let rebuild_since: u64 = match self.buffer.iter().next() {
@@ -524,6 +594,22 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
    }
 }

+/// Outcome of the `BufferedHistoricLayerCoverage::replace` operation.
+#[derive(Debug)]
+pub enum Replacement<Value> {
+    /// Previous value was replaced with the new value.
+    Replaced {
+        /// `true` if the replaced value was a buffered insert, `false` if it was an existing layer.
+        in_buffered: bool,
+    },
+    /// Key was found neither in the buffered updates nor in the existing layers.
+    NotFound,
+    /// Key has been scheduled for removal; it was not replaced.
+    RemovalBuffered,
+    /// Previous value was rejected by the closure; carries a clone of that value.
+    Unexpected(Value),
+}
+
 #[test]
 fn test_retroactive_regression_1() {
    let mut map = BufferedHistoricLayerCoverage::new();
@@ -632,3 +718,139 @@ fn test_retroactive_simple() {
        assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
    }
 }
+
+#[test]
+fn test_retroactive_replacement() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+
+    let keys = [
+        LayerKey {
+            key: 0..5,
+            lsn: 100..101,
+            is_image: true,
+        },
+        LayerKey {
+            key: 3..9,
+            lsn: 110..111,
+            is_image: true,
+        },
+        LayerKey {
+            key: 4..6,
+            lsn: 120..121,
+            is_image: true,
+        },
+    ];
+
+    let layers = [
+        "Image 1".to_string(),
+        "Image 2".to_string(),
+        "Image 3".to_string(),
+    ];
+
+    for (key, layer) in keys.iter().zip(layers.iter()) {
+        map.insert(key.to_owned(), layer.to_owned());
+    }
+
+    // rebuild is not necessary here, because replace works for both buffered updates and existing
+    // layers.
+
+    for (key, orig_layer) in keys.iter().zip(layers.iter()) {
+        let replacement = format!("Remote {orig_layer}");
+
+        // evict
+        let ret = map.replace(key, replacement.clone(), |l| l == orig_layer);
+        assert!(
+            matches!(ret, Replacement::Replaced { .. }),
+            "replace {orig_layer}: {ret:?}"
+        );
+        map.rebuild();
+
+        let at = key.lsn.end + 1;
+
+        let version = map.get().expect("rebuilt").get_version(at).unwrap();
+        assert_eq!(
+            version.image_coverage.query(4).as_deref(),
+            Some(replacement.as_str()),
+            "query for 4 at version {at} after eviction",
+        );
+
+        // download
+        let ret = map.replace(key, orig_layer.clone(), |l| l == &replacement);
+        assert!(
+            matches!(ret, Replacement::Replaced { .. }),
+            "replace {orig_layer} back: {ret:?}"
+        );
+        map.rebuild();
+        let version = map.get().expect("rebuilt").get_version(at).unwrap();
+        assert_eq!(
+            version.image_coverage.query(4).as_deref(),
+            Some(orig_layer.as_str()),
+            "query for 4 at version {at} after download",
+        );
+    }
+}
+
+#[test]
+fn missing_key_is_not_inserted_with_replace() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+    let key = LayerKey {
+        key: 0..5,
+        lsn: 100..101,
+        is_image: true,
+    };
+
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(matches!(ret, Replacement::NotFound), "{ret:?}");
+    map.rebuild();
+    assert!(map
+        .get()
+        .expect("no changes to rebuild")
+        .get_version(102)
+        .is_none());
+}
+
+#[test]
+fn replacing_buffered_insert_and_remove() {
+    let mut map = BufferedHistoricLayerCoverage::new();
+    let key = LayerKey {
+        key: 0..5,
+        lsn: 100..101,
+        is_image: true,
+    };
+
+    map.insert(key.clone(), "Image 1");
+    let ret = map.replace(&key, "Remote Image 1", |&l| l == "Image 1");
+    assert!(
+        matches!(ret, Replacement::Replaced { in_buffered: true }),
+        "{ret:?}"
+    );
+    map.rebuild();
+
+    assert_eq!(
+        map.get()
+            .expect("rebuilt")
+            .get_version(102)
+            .unwrap()
+            .image_coverage
+            .query(4),
+        Some("Remote Image 1")
+    );
+
+    map.remove(key.clone());
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(
+        matches!(ret, Replacement::RemovalBuffered),
+        "cannot replace after scheduled remove: {ret:?}"
+    );
+
+    map.rebuild();
+
+    let ret = map.replace(&key, "should not replace", |_| true);
+    assert!(
+        matches!(ret, Replacement::NotFound),
+        "cannot replace after remove + rebuild: {ret:?}"
+    );
+
+    let at_version = map.get().expect("rebuilt").get_version(102);
+    assert!(at_version.is_none());
+}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
 use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
-use anyhow::{Context, Result};
+use anyhow::Result;
 use bytes::Bytes;
 use enum_map::EnumMap;
 use enumset::EnumSet;
@@ -24,7 +24,7 @@ use pageserver_api::models::{
 use std::ops::Range;
 use std::path::PathBuf;
 use std::sync::{Arc, Mutex};
-use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
 use utils::rate_limit::RateLimit;
@@ -335,8 +335,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-#[async_trait::async_trait]
-pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
+pub trait Layer: std::fmt::Debug + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -366,74 +365,13 @@ pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
    /// is available. If this returns ValueReconstructResult::Continue, look up
    /// the predecessor layer and call again with the same 'reconstruct_data' to
    /// collect more data.
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        reconstruct_data: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)>;
-
-    /// CANCEL SAFETY: if the returned future is dropped,
-    /// the wrapped closure still run to completion and the return value discarded.
-    /// For the case of get_value_reconstruct_data, we expect the closure to not
-    /// have any side effects, as it only attempts to read a layer (and stuff like
-    /// page cache isn't considered a real side effect).
-    /// But, ...
-    /// TRACING:
-    /// If the returned future is cancelled, the spawn_blocking span can outlive
-    /// the caller's span.
-    /// So, technically, we should be using `parent: None` and `follows_from: current`
-    /// instead. However, in practice, the advantage of maintaining the span stack
-    /// in logs outweighs the disadvantage of having a dangling span in a case that
-    /// is not expected to happen because in pageserver we generally don't drop pending futures.
-    async fn get_value_reconstruct_data(
-        self: Arc<Self>,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_data: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
-        let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
-        static USE_SPAWN_BLOCKING: Lazy<bool> = Lazy::new(|| {
-            let val = std::env::var("PAGESERVER_LAYER_GET_RECONSTRUCT_DATA_USE_SPAWN_BLOCKING")
-                .map(|s| s == "1")
-                .unwrap_or(false);
-            tracing::info!("PAGESERVER_LAYER_GET_RECONSTRUCT_DATA_USE_SPAWN_BLOCKING={val}");
-            val
-        });
-        let use_spawn_blocking = *USE_SPAWN_BLOCKING;
-        let start = Instant::now();
-        let res = if !use_spawn_blocking {
-            anyhow::Ok(self.get_value_reconstruct_data_blocking(
-                key,
-                lsn_range,
-                reconstruct_data,
-                ctx,
-            ))
-        } else {
-            crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_STARTED_COUNT.inc();
-            crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE.inc();
-            let res = tokio::task::spawn_blocking(move || {
-                crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_QUEUE_DELAY
-                    .observe(start.elapsed().as_secs_f64());
-                let _enter = span.enter();
-                self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
-            })
-            .await
-            .context("spawn_blocking");
-            crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_SPAWN_BLOCKING_ACTIVE_GAUGE.dec();
-            res
-        };
-        let histo = match &res {
-            Ok(Ok(_)) => &crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_OK,
-            Ok(Err(_)) | Err(_) => {
-                &crate::metrics::LAYER_GET_VALUE_RECONSTRUCT_DATA_COMPLETION_TIME_ERROR
-            }
-        };
-        histo.observe(start.elapsed().as_secs_f64());
-        res?
-    }
+        reconstruct_data: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult>;

    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
    fn short_id(&self) -> String;
@@ -545,8 +483,17 @@ pub mod tests {
        }
    }

-    #[async_trait::async_trait]
    impl Layer for LayerDescriptor {
+        fn get_value_reconstruct_data(
+            &self,
+            _key: Key,
+            _lsn_range: Range<Lsn>,
+            _reconstruct_data: &mut ValueReconstructState,
+            _ctx: &RequestContext,
+        ) -> Result<ValueReconstructResult> {
+            todo!("This method shouldn't be part of the Layer trait")
+        }
+
        fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
            todo!()
        }
@@ -561,16 +508,6 @@ pub mod tests {
            self.layer_desc().lsn_range.clone()
        }

-        fn get_value_reconstruct_data_blocking(
-            &self,
-            _key: Key,
-            _lsn_range: Range<Lsn>,
-            _reconstruct_data: ValueReconstructState,
-            _ctx: RequestContext,
-        ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
-            todo!("This method shouldn't be part of the Layer trait")
-        }
-
        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
        fn is_incremental(&self) -> bool {
            self.layer_desc().is_incremental
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -218,7 +218,6 @@ impl std::fmt::Debug for DeltaLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -295,13 +294,13 @@ impl Layer for DeltaLayer {
        Ok(())
    }

-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.desc.lsn_range.start);
        let mut need_image = true;

@@ -309,7 +308,7 @@ impl Layer for DeltaLayer {

        {
            // Open the file and lock the metadata in memory
-            let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

            // Scan the page versions backwards, starting from `lsn`.
            let file = &inner.file;
@@ -375,9 +374,9 @@ impl Layer for DeltaLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok((reconstruct_state, ValueReconstructResult::Continue))
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        }
    }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -149,7 +149,6 @@ impl std::fmt::Debug for ImageLayerInner {
    }
 }

-#[async_trait::async_trait]
 impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
@@ -182,18 +181,18 @@ impl Layer for ImageLayer {
    }

    /// Look up given page in the file
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        assert!(self.desc.key_range.contains(&key));
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

-        let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
+        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;

        let file = inner.file.as_ref().unwrap();
        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
@@ -211,9 +210,9 @@ impl Layer for ImageLayer {
            let value = Bytes::from(blob);

            reconstruct_state.img = Some((self.lsn, value));
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Missing))
+            Ok(ValueReconstructResult::Missing)
        }
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -110,7 +110,6 @@ impl InMemoryLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for InMemoryLayer {
    fn get_key_range(&self) -> Range<Key> {
        Key::MIN..Key::MAX
@@ -191,13 +190,13 @@ impl Layer for InMemoryLayer {
    }

    /// Look up given value in the layer.
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        key: Key,
        lsn_range: Range<Lsn>,
-        mut reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
+        reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

@@ -214,7 +213,7 @@ impl Layer for InMemoryLayer {
                match value {
                    Value::Image(img) => {
                        reconstruct_state.img = Some((*entry_lsn, img));
-                        return Ok((reconstruct_state, ValueReconstructResult::Complete));
+                        return Ok(ValueReconstructResult::Complete);
                    }
                    Value::WalRecord(rec) => {
                        let will_init = rec.will_init();
@@ -234,9 +233,9 @@ impl Layer for InMemoryLayer {
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
-            Ok((reconstruct_state, ValueReconstructResult::Continue))
+            Ok(ValueReconstructResult::Continue)
        } else {
-            Ok((reconstruct_state, ValueReconstructResult::Complete))
+            Ok(ValueReconstructResult::Complete)
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
 use crate::repository::Key;
 use crate::tenant::layer_map::BatchedUpdates;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
-use crate::tenant::storage_layer::{Layer, ValueReconstructState};
+use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use anyhow::{bail, Result};
 use pageserver_api::models::HistoricLayerInfo;
 use std::ops::Range;
@@ -21,7 +21,7 @@ use utils::{
 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
    DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -63,15 +63,14 @@ impl std::fmt::Debug for RemoteLayer {
    }
 }

-#[async_trait::async_trait]
 impl Layer for RemoteLayer {
-    fn get_value_reconstruct_data_blocking(
+    fn get_value_reconstruct_data(
        &self,
        _key: Key,
        _lsn_range: Range<Lsn>,
-        _reconstruct_state: ValueReconstructState,
-        _ctx: RequestContext,
-    ) -> Result<(ValueReconstructState, ValueReconstructResult)> {
+        _reconstruct_state: &mut ValueReconstructState,
+        _ctx: &RequestContext,
+    ) -> Result<ValueReconstructResult> {
        bail!(
            "layer {} needs to be downloaded",
            self.filename().file_name()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -8,7 +8,7 @@ use bytes::Bytes;
 use fail::fail_point;
 use futures::StreamExt;
 use itertools::Itertools;
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -82,11 +82,12 @@ use self::eviction_task::EvictionTaskTimelineState;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
+use super::layer_cache::{LayerCache, LayerDeletionGuard};
 use super::layer_map::BatchedUpdates;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
-    DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc, PersistentLayerKey,
+    DeltaLayer, ImageLayer, Layer, LayerAccessStatsReset, PersistentLayerDesc,
 };

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -120,77 +121,11 @@ impl PartialOrd for Hole {
    }
 }

-pub struct LayerFileManager(HashMap<PersistentLayerKey, Arc<dyn PersistentLayer>>);
+pub struct LayerFileManager(());

 impl LayerFileManager {
-    fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Arc<dyn PersistentLayer> {
-        // The assumption for the `expect()` is that all code maintains the following invariant:
-        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
-        self.0
-            .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename().file_name()))
-            .expect("not found")
-            .clone()
-    }
-
-    pub(crate) fn insert(&mut self, layer: Arc<dyn PersistentLayer>) {
-        let present = self.0.insert(layer.layer_desc().key(), layer.clone());
-        if present.is_some() && cfg!(debug_assertions) {
-            panic!("overwriting a layer: {:?}", layer.layer_desc())
-        }
-    }
-
    pub(crate) fn new() -> Self {
-        Self(HashMap::new())
-    }
-
-    pub(crate) fn remove(&mut self, layer: Arc<dyn PersistentLayer>) {
-        let present = self.0.remove(&layer.layer_desc().key());
-        if present.is_none() && cfg!(debug_assertions) {
-            panic!(
-                "removing layer that is not present in layer mapping: {:?}",
-                layer.layer_desc()
-            )
-        }
-    }
-
-    pub(crate) fn replace_and_verify(
-        &mut self,
-        expected: Arc<dyn PersistentLayer>,
-        new: Arc<dyn PersistentLayer>,
-    ) -> Result<()> {
-        let key = expected.layer_desc().key();
-        let other = new.layer_desc().key();
-
-        let expected_l0 = LayerMap::is_l0(expected.layer_desc());
-        let new_l0 = LayerMap::is_l0(new.layer_desc());
-
-        fail::fail_point!("layermap-replace-notfound", |_| anyhow::bail!(
-            "layermap-replace-notfound"
-        ));
-
-        anyhow::ensure!(
-            key == other,
-            "expected and new layer have different keys: {key:?} != {other:?}"
-        );
-
-        anyhow::ensure!(
-            expected_l0 == new_l0,
-            "one layer is l0 while the other is not: {expected_l0} != {new_l0}"
-        );
-
-        if let Some(layer) = self.0.get_mut(&expected.layer_desc().key()) {
-            anyhow::ensure!(
-                compare_arced_layers(&expected, layer),
-                "another layer was found instead of expected, expected={expected:?}, new={new:?}",
-                expected = Arc::as_ptr(&expected),
-                new = Arc::as_ptr(layer),
-            );
-            *layer = new;
-            Ok(())
-        } else {
-            anyhow::bail!("layer was not found");
-        }
+        Self(())
    }
 }

@@ -207,7 +142,7 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 }

 pub struct Timeline {
-    conf: &'static PageServerConf,
+    pub(super) conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

    myself: Weak<Self>,
@@ -217,25 +152,10 @@ pub struct Timeline {

    pub pg_version: u32,

-    /// The tuple has two elements.
-    /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
-    /// 2. `LayerMap`, the acceleration data structure for `get_reconstruct_data`.
-    ///
-    /// `LayerMap` maps out the `(PAGE,LSN) / (KEY,LSN)` space, which is composed of `(KeyRange, LsnRange)` rectangles.
-    /// We describe these rectangles through the `PersistentLayerDesc` struct.
-    ///
-    /// When we want to reconstruct a page, we first find the `PersistentLayerDesc`'s that we need for page reconstruction,
-    /// using `LayerMap`. Then, we use `LayerFileManager` to get the `PersistentLayer`'s that correspond to the
-    /// `PersistentLayerDesc`'s.
-    ///
-    /// Hence, it's important to keep things coherent. The `LayerFileManager` must always have an entry for all
-    /// `PersistentLayerDesc`'s in the `LayerMap`. If it doesn't, `LayerFileManager::get_from_desc` will panic at
-    /// runtime, e.g., during page reconstruction.
-    ///
-    /// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
-    /// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
    pub(crate) layers: Arc<tokio::sync::RwLock<(LayerMap, LayerFileManager)>>,

+    pub(super) layer_cache: LayerCache,
+
    /// Set of key ranges which should be covered by image layers to
    /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored.
    /// It is used by compaction task when it checks if new image layer should be created.
@@ -307,13 +227,6 @@ pub struct Timeline {
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

-    /// Layer removal lock.
-    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
-    /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
-    /// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
-    /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
-    pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
-
    // Needed to ensure that we can't create a branch at a point that was already garbage collected
    pub latest_gc_cutoff_lsn: Rcu<Lsn>,

@@ -660,33 +573,16 @@ impl Timeline {
            None => None,
        };

-        let reconstruct_state = ValueReconstructState {
+        let mut reconstruct_state = ValueReconstructState {
            records: Vec::new(),
            img: cached_page_img,
        };

-        static GET_RECONSTRUCT_DATA_CONCURRENCY: Lazy<Option<usize>> = Lazy::new(|| {
-            std::env::var("PAGESERVER_TIMELINE_GET_RECONSTRUCT_DATA_CONCURRENCY_LIMIT")
-                .ok()
-                .and_then(|s| s.parse().ok())
-        });
-        static GET_RECONSTRUCT_DATA_SEMAPHORE: Lazy<Option<Semaphore>> =
-            Lazy::new(|| (*GET_RECONSTRUCT_DATA_CONCURRENCY).map(Semaphore::new));
-
-        let permit = if let Some(sem) = GET_RECONSTRUCT_DATA_SEMAPHORE.as_ref() {
-            Some(sem.acquire().await)
-        } else {
-            None
-        };
-
        let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
-        let reconstruct_state = self
-            .get_reconstruct_data(key, lsn, reconstruct_state, ctx)
+        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();

-        drop(permit);
-
        RECONSTRUCT_TIME
            .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
    }
@@ -942,7 +838,7 @@ impl Timeline {
        // Below are functions compact_level0() and create_image_layers()
        // but they are a bit ad hoc and don't quite work like it's explained
        // above. Rewrite it.
-        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
+        let layer_removal_cs = self.layer_cache.delete_guard().await;
        // Is the timeline being deleted?
        if self.is_stopping() {
            return Err(anyhow::anyhow!("timeline is Stopping").into());
@@ -1165,7 +1061,7 @@ impl Timeline {

    pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
        let guard = self.layers.read().await;
-        let (layer_map, mapping) = &*guard;
+        let (layer_map, _) = &*guard;
        let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
        if let Some(open_layer) = &layer_map.open_layer {
            in_memory_layers.push(open_layer.info());
@@ -1176,7 +1072,7 @@ impl Timeline {

        let mut historic_layers = Vec::new();
        for historic_layer in layer_map.iter_historic_layers() {
-            let historic_layer = mapping.get_from_desc(&historic_layer);
+            let historic_layer = self.layer_cache.get_from_desc(&historic_layer);
            historic_layers.push(historic_layer.info(reset));
        }

@@ -1274,7 +1170,7 @@ impl Timeline {
            .context("wait for layer upload ops to complete")?;

        // now lock out layer removal (compaction, gc, timeline deletion)
-        let layer_removal_guard = self.layer_removal_cs.lock().await;
+        let layer_removal_guard = self.layer_cache.delete_guard().await;

        {
            // to avoid racing with detach and delete_timeline
@@ -1287,7 +1183,7 @@ impl Timeline {

        // start the batch update
        let mut guard = self.layers.write().await;
-        let (layer_map, mapping) = &mut *guard;
+        let (layer_map, _) = &mut *guard;
        let mut batch_updates = layer_map.batch_update();

        let mut results = Vec::with_capacity(layers_to_evict.len());
@@ -1296,12 +1192,7 @@ impl Timeline {
            let res = if cancel.is_cancelled() {
                None
            } else {
-                Some(self.evict_layer_batch_impl(
-                    &layer_removal_guard,
-                    l,
-                    &mut batch_updates,
-                    mapping,
-                ))
+                Some(self.evict_layer_batch_impl(&layer_removal_guard, l, &mut batch_updates))
            };
            results.push(res);
        }
@@ -1317,10 +1208,9 @@ impl Timeline {

    fn evict_layer_batch_impl(
        &self,
-        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
+        _layer_removal_cs: &LayerDeletionGuard,
        local_layer: &Arc<dyn PersistentLayer>,
        batch_updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
    ) -> anyhow::Result<bool> {
        if local_layer.is_remote_layer() {
            // TODO(issue #3851): consider returning an err here instead of false,
@@ -1371,7 +1261,10 @@ impl Timeline {

        assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());

-        let succeed = match mapping.replace_and_verify(local_layer.clone(), new_remote_layer) {
+        let succeed = match self
+            .layer_cache
+            .replace_and_verify(local_layer.clone(), new_remote_layer)
+        {
            Ok(()) => {
                if let Err(e) = local_layer.delete_resident_layer_file() {
                    error!("failed to remove layer file on evict after replacement: {e:#?}");
@@ -1419,9 +1312,6 @@ impl Timeline {
    }
 }

-/// Number of times we will compute partition within a checkpoint distance.
-const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
-
 // Private functions
 impl Timeline {
    fn get_checkpoint_distance(&self) -> u64 {
@@ -1546,6 +1436,7 @@ impl Timeline {
                    LayerMap::default(),
                    LayerFileManager::new(),
                ))),
+                layer_cache: LayerCache::new(myself.clone()),
                wanted_image_layers: Mutex::new(None),

                walredo_mgr,
@@ -1581,7 +1472,6 @@ impl Timeline {
                layer_flush_done_tx,

                write_lock: tokio::sync::Mutex::new(()),
-                layer_removal_cs: Default::default(),

                gc_info: std::sync::RwLock::new(GcInfo {
                    retain_lsns: Vec::new(),
@@ -1619,8 +1509,7 @@ impl Timeline {
                initial_logical_size_can_start,
                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
            };
-            result.repartition_threshold =
-                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
+            result.repartition_threshold = result.get_checkpoint_distance() / 10;
            result
                .metrics
                .last_record_gauge
@@ -1738,7 +1627,7 @@ impl Timeline {
    ///
    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;
-        let (layers, mapping) = &mut *guard;
+        let (layers, _) = &mut *guard;
        let mut updates = layers.batch_update();
        let mut num_layers = 0;

@@ -1781,7 +1670,8 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
+                updates.insert_historic(layer.layer_desc().clone());
+                self.layer_cache.populate_local_when_init(Arc::new(layer));
                num_layers += 1;
            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
                // Create a DeltaLayer struct for each delta file.
@@ -1813,7 +1703,8 @@ impl Timeline {

                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
-                self.insert_historic_layer(Arc::new(layer), &mut updates, mapping);
+                updates.insert_historic(layer.layer_desc().clone());
+                self.layer_cache.populate_local_when_init(Arc::new(layer));
                num_layers += 1;
            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                // ignore these
@@ -1868,7 +1759,7 @@ impl Timeline {
        // We're holding a layer map lock for a while but this
        // method is only called during init so it's fine.
        let mut guard = self.layers.write().await;
-        let (layer_map, mapping) = &mut *guard;
+        let (layer_map, _) = &mut *guard;
        let mut updates = layer_map.batch_update();
        for remote_layer_name in &index_part.timeline_layers {
            let local_layer = local_only_layers.remove(remote_layer_name);
@@ -1913,7 +1804,8 @@ impl Timeline {
                        anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                    } else {
                        self.metrics.resident_physical_size_gauge.sub(local_size);
-                        self.remove_historic_layer(local_layer, &mut updates, mapping);
+                        updates.remove_historic(local_layer.layer_desc().clone());
+                        self.layer_cache.remove_local_when_init(local_layer);
                        // fall-through to adding the remote layer
                    }
                } else {
@@ -1952,7 +1844,8 @@ impl Timeline {
                    );
                    let remote_layer = Arc::new(remote_layer);

-                    self.insert_historic_layer(remote_layer, &mut updates, mapping);
+                    updates.insert_historic(remote_layer.layer_desc().clone());
+                    self.layer_cache.populate_remote_when_init(remote_layer);
                }
                LayerFileName::Delta(deltafilename) => {
                    // Create a RemoteLayer for the delta file.
@@ -1979,7 +1872,8 @@ impl Timeline {
                        ),
                    );
                    let remote_layer = Arc::new(remote_layer);
-                    self.insert_historic_layer(remote_layer, &mut updates, mapping);
+                    updates.insert_historic(remote_layer.layer_desc().clone());
+                    self.layer_cache.populate_remote_when_init(remote_layer);
                }
            }
        }
@@ -2020,10 +1914,10 @@ impl Timeline {

        let local_layers = {
            let guard = self.layers.read().await;
-            let (layers, mapping) = &*guard;
+            let (layers, _) = &*guard;
            layers
                .iter_historic_layers()
-                .map(|l| (l.filename(), mapping.get_from_desc(&l)))
+                .map(|l| (l.filename(), self.layer_cache.get_from_desc(&l)))
                .collect::<HashMap<_, _>>()
        };

@@ -2397,52 +2291,27 @@ impl Timeline {

    async fn find_layer(&self, layer_file_name: &str) -> Option<Arc<dyn PersistentLayer>> {
        let guard = self.layers.read().await;
-        let (layers, mapping) = &*guard;
+        let (layers, _) = &*guard;
        for historic_layer in layers.iter_historic_layers() {
            let historic_layer_name = historic_layer.filename().file_name();
            if layer_file_name == historic_layer_name {
-                return Some(mapping.get_from_desc(&historic_layer));
+                return Some(self.layer_cache.get_from_desc(&historic_layer));
            }
        }

        None
    }

-    /// Helper function to insert a layer from both layer map and layer file manager. Will be removed in the future
-    /// after we introduce `LayerMapManager`.
-    fn insert_historic_layer(
-        &self,
-        layer: Arc<dyn PersistentLayer>,
-        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
-    ) {
-        updates.insert_historic(layer.layer_desc().clone());
-        mapping.insert(layer);
-    }
-
-    /// Helper function to remove a layer from both layer map and layer file manager. Will be removed in the future
-    /// after we introduce `LayerMapManager`.
-    fn remove_historic_layer(
-        &self,
-        layer: Arc<dyn PersistentLayer>,
-        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
-    ) {
-        updates.remove_historic(layer.layer_desc().clone());
-        mapping.remove(layer);
-    }
-
    /// Removes the layer from local FS (if present) and from memory.
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        &self,
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        _layer_removal_cs: LayerDeletionGuard,
        layer: Arc<PersistentLayerDesc>,
        updates: &mut BatchedUpdates<'_>,
-        mapping: &mut LayerFileManager,
    ) -> anyhow::Result<()> {
-        let layer = mapping.get_from_desc(&layer);
+        let layer = self.layer_cache.get_from_desc(&layer);
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
            let layer_file_size = layer.file_size();
@@ -2457,7 +2326,7 @@ impl Timeline {
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
        updates.remove_historic(layer.layer_desc().clone());
-        mapping.remove(layer);
+        self.layer_cache.delete_layer(layer);

        Ok(())
    }
@@ -2512,9 +2381,9 @@ impl Timeline {
        &self,
        key: Key,
        request_lsn: Lsn,
-        mut reconstruct_state: ValueReconstructState,
+        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
-    ) -> Result<ValueReconstructState, PageReconstructError> {
+    ) -> Result<(), PageReconstructError> {
        // Start from the current timeline.
        let mut timeline_owned;
        let mut timeline = self;
@@ -2544,12 +2413,12 @@ impl Timeline {
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
-                ValueReconstructResult::Complete => return Ok(reconstruct_state),
+                ValueReconstructResult::Complete => return Ok(()),
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
-                        return Ok(reconstruct_state);
+                        return Ok(());
                    }
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
@@ -2643,7 +2512,7 @@ impl Timeline {
            'layer_map_search: loop {
                let remote_layer = {
                    let guard = timeline.layers.read().await;
-                    let (layers, mapping) = &*guard;
+                    let (layers, _) = &*guard;

                    // Check the open and frozen in-memory layers first, in order from newest
                    // to oldest.
@@ -2654,19 +2523,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(open_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match open_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2687,19 +2550,13 @@ impl Timeline {
                        if cont_lsn > start_lsn {
                            //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
                            let lsn_floor = max(cached_lsn + 1, start_lsn);
-                            result = match Arc::clone(frozen_layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match frozen_layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -2717,7 +2574,7 @@ impl Timeline {
                    }

                    if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
-                        let layer = mapping.get_from_desc(&layer);
+                        let layer = timeline.layer_cache.get_from_desc(&layer);
                        // If it's a remote layer, download it and retry.
                        if let Some(remote_layer) =
                            super::storage_layer::downcast_remote_layer(&layer)
@@ -2729,19 +2586,13 @@ impl Timeline {
                            // Get all the data needed to reconstruct the page version from this layer.
                            // But if we have an older cached page image, no need to go past that.
                            let lsn_floor = max(cached_lsn + 1, lsn_floor);
-                            result = match Arc::clone(&layer)
-                                .get_value_reconstruct_data(
-                                    key,
-                                    lsn_floor..cont_lsn,
-                                    reconstruct_state,
-                                    ctx.attached_child(),
-                                )
-                                .await
-                            {
-                                Ok((new_reconstruct_state, result)) => {
-                                    reconstruct_state = new_reconstruct_state;
-                                    result
-                                }
+                            result = match layer.get_value_reconstruct_data(
+                                key,
+                                lsn_floor..cont_lsn,
+                                reconstruct_state,
+                                ctx,
+                            ) {
+                                Ok(result) => result,
                                Err(e) => return Err(PageReconstructError::from(e)),
                            };
                            cont_lsn = lsn_floor;
@@ -3230,14 +3081,15 @@ impl Timeline {
        // Add it to the layer map
        let l = Arc::new(new_delta);
        let mut guard = self.layers.write().await;
-        let (layers, mapping) = &mut *guard;
+        let (layers, _) = &mut *guard;
        let mut batch_updates = layers.batch_update();
        l.access_stats().record_residence_event(
            &batch_updates,
            LayerResidenceStatus::Resident,
            LayerResidenceEventReason::LayerCreate,
        );
-        self.insert_historic_layer(l, &mut batch_updates, mapping);
+        batch_updates.insert_historic(l.layer_desc().clone());
+        self.layer_cache.create_new_layer(l);
        batch_updates.flush();

        // update metrics
@@ -3466,7 +3318,7 @@ impl Timeline {
        let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());

        let mut guard = self.layers.write().await;
-        let (layers, mapping) = &mut *guard;
+        let (layers, _) = &mut *guard;
        let mut updates = layers.batch_update();
        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);

@@ -3488,7 +3340,8 @@ impl Timeline {
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
-            self.insert_historic_layer(l, &mut updates, mapping);
+            updates.insert_historic(l.layer_desc().clone());
+            self.layer_cache.create_new_layer(l);
        }
        updates.flush();
        drop_wlock(guard);
@@ -3644,14 +3497,9 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
 }

 impl Timeline {
-    /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
-    ///
-    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
-    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
-    /// start of level0 files compaction, the on-demand download should be revisited as well.
    fn compact_level0_phase1(
        self: Arc<Self>,
-        _layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        _layer_removal_cs: LayerDeletionGuard,
        guard: tokio::sync::OwnedRwLockReadGuard<(LayerMap, LayerFileManager)>,
        mut stats: CompactLevel0Phase1StatsBuilder,
        target_file_size: u64,
@@ -3659,11 +3507,11 @@ impl Timeline {
    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
        stats.read_lock_held_spawn_blocking_startup_micros =
            stats.read_lock_acquisition_micros.till_now(); // set by caller
-        let (layers, mapping) = &*guard;
+        let (layers, _) = &*guard;
        let level0_deltas = layers.get_level0_deltas()?;
        let mut level0_deltas = level0_deltas
            .into_iter()
-            .map(|x| mapping.get_from_desc(&x))
+            .map(|x| self.layer_cache.get_from_desc(&x))
            .collect_vec();
        stats.level0_deltas_count = Some(level0_deltas.len());
        // Only compact if enough layers have accumulated.
@@ -4024,7 +3872,7 @@ impl Timeline {
    ///
    async fn compact_level0(
        self: &Arc<Self>,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer_removal_cs: LayerDeletionGuard,
        target_file_size: u64,
        ctx: &RequestContext,
    ) -> Result<(), CompactionError> {
@@ -4079,7 +3927,7 @@ impl Timeline {
        }

        let mut guard = self.layers.write().await;
-        let (layers, mapping) = &mut *guard;
+        let (layers, _) = &mut *guard;
        let mut updates = layers.batch_update();
        let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
        for l in new_layers {
@@ -4111,7 +3959,8 @@ impl Timeline {
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
-            self.insert_historic_layer(x, &mut updates, mapping);
+            updates.insert_historic(x.layer_desc().clone());
+            self.layer_cache.create_new_layer(x);
        }

        // Now that we have reshuffled the data to set of new delta layers, we can
@@ -4119,10 +3968,7 @@ impl Timeline {
        let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
        for l in deltas_to_compact {
            layer_names_to_delete.push(l.filename());
-            // NB: the layer file identified by descriptor `l` is guaranteed to be present
-            // in the LayerFileManager because we kept holding `layer_removal_cs` the entire
-            // time, even though we dropped `Timeline::layers` inbetween.
-            self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates, mapping)?;
+            self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?;
        }
        updates.flush();
        drop_wlock(guard);
@@ -4243,7 +4089,7 @@ impl Timeline {

        fail_point!("before-timeline-gc");

-        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
+        let layer_removal_cs = self.layer_cache.delete_guard().await;
        // Is the timeline being deleted?
        if self.is_stopping() {
            anyhow::bail!("timeline is Stopping");
@@ -4281,7 +4127,7 @@ impl Timeline {

    async fn gc_timeline(
        &self,
-        layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
+        layer_removal_cs: LayerDeletionGuard,
        horizon_cutoff: Lsn,
        pitr_cutoff: Lsn,
        retain_lsns: Vec<Lsn>,
@@ -4343,7 +4189,7 @@ impl Timeline {
        //
        // TODO holding a write lock is too aggressive and avoidable
        let mut guard = self.layers.write().await;
-        let (layers, mapping) = &mut *guard;
+        let (layers, _) = &mut *guard;
        'outer: for l in layers.iter_historic_layers() {
            result.layers_total += 1;

@@ -4459,7 +4305,6 @@ impl Timeline {
                        layer_removal_cs.clone(),
                        doomed_layer,
                        &mut updates,
-                        mapping,
                    )?; // FIXME: schedule succeeded deletions before returning?
                    result.layers_removed += 1;
                }
@@ -4645,13 +4490,14 @@ impl Timeline {
                    // Download complete. Replace the RemoteLayer with the corresponding
                    // Delta- or ImageLayer in the layer map.
                    let mut guard = self_clone.layers.write().await;
-                    let (layers, mapping) = &mut *guard;
+                    let (layers, _) = &mut *guard;
                    let updates = layers.batch_update();
                    let new_layer =
                        remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size);
                    {
                        let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        let failure = match mapping.replace_and_verify(l, new_layer) {
+                        let failure = match self_clone.layer_cache.replace_and_verify(l, new_layer)
+                        {
                            Ok(()) => false,
                            Err(e) => {
                                // this is a precondition failure, the layer filename derived
@@ -4780,10 +4626,10 @@ impl Timeline {
        let mut downloads = Vec::new();
        {
            let guard = self.layers.read().await;
-            let (layers, mapping) = &*guard;
+            let (layers, _) = &*guard;
            layers
                .iter_historic_layers()
-                .map(|l| mapping.get_from_desc(&l))
+                .map(|l| self.layer_cache.get_from_desc(&l))
                .filter_map(|l| l.downcast_remote_layer())
                .map(|l| self.download_remote_layer(l))
                .for_each(|dl| downloads.push(dl))
@@ -4885,7 +4731,7 @@ impl LocalLayerInfoForDiskUsageEviction {
 impl Timeline {
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
        let guard = self.layers.read().await;
-        let (layers, mapping) = &*guard;
+        let (layers, _) = &*guard;

        let mut max_layer_size: Option<u64> = None;
        let mut resident_layers = Vec::new();
@@ -4894,7 +4740,7 @@ impl Timeline {
            let file_size = l.file_size();
            max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));

-            let l = mapping.get_from_desc(&l);
+            let l = self.layer_cache.get_from_desc(&l);

            if l.is_remote_layer() {
                continue;
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -198,10 +198,10 @@ impl Timeline {
        // So, we just need to deal with this.
        let candidates: Vec<Arc<dyn PersistentLayer>> = {
            let guard = self.layers.read().await;
-            let (layers, mapping) = &*guard;
+            let (layers, _) = &*guard;
            let mut candidates = Vec::new();
            for hist_layer in layers.iter_historic_layers() {
-                let hist_layer = mapping.get_from_desc(&hist_layer);
+                let hist_layer = self.layer_cache.get_from_desc(&hist_layer);
                if hist_layer.is_remote_layer() {
                    continue;
                }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -2675,6 +2675,7 @@ bool
 neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 {
 	XLogRecPtr	end_recptr = record->EndRecPtr;
+	XLogRecPtr	prev_end_recptr = record->ReadRecPtr - 1;
 	RelFileNode	rnode;
 	ForkNumber	forknum;
 	BlockNumber	blkno;
@@ -2718,15 +2719,16 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	no_redo_needed = buffer < 0;

-	/* In both cases st lwlsn past this WAL record */
-	SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
-
-	/* we don't have the buffer in memory, update lwLsn past this record,
-	 * also evict page fro file cache
-	 */
+	/* we don't have the buffer in memory, update lwLsn past this record */
 	if (no_redo_needed)
+	{
+		SetLastWrittenLSNForBlock(end_recptr, rnode, forknum, blkno);
 		lfc_evict(rnode, forknum, blkno);
-
+	}
+	else
+	{
+		SetLastWrittenLSNForBlock(prev_end_recptr, rnode, forknum, blkno);
+	}

 	LWLockRelease(partitionLock);

@@ -2734,10 +2736,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	if (get_cached_relsize(rnode, forknum, &relsize))
 	{
 		if (relsize < blkno + 1)
-		{
 			update_cached_relsize(rnode, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);
-		}
 	}
 	else
 	{
@@ -2769,7 +2768,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		Assert(nbresponse->n_blocks > blkno);

 		set_cached_relsize(rnode, forknum, nbresponse->n_blocks);
-		SetLastWrittenLSNForRelation(end_recptr, rnode, forknum);

 		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
 	}
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -1,6 +1,7 @@
 import shutil
 import time
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Dict, Tuple

 import pytest
@@ -427,14 +428,14 @@ def poor_mans_du(
    largest_layer = 0
    smallest_layer = None
    for tenant_id, timeline_id in timelines:
-        timeline_dir = env.timeline_dir(tenant_id, timeline_id)
-        assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
-        total = 0
-        for file in timeline_dir.iterdir():
+        dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+        assert dir.exists(), f"timeline dir does not exist: {dir}"
+        sum = 0
+        for file in dir.iterdir():
            if "__" not in file.name:
                continue
            size = file.stat().st_size
-            total += size
+            sum += size
            largest_layer = max(largest_layer, size)
            if smallest_layer:
                smallest_layer = min(smallest_layer, size)
@@ -442,8 +443,8 @@ def poor_mans_du(
                smallest_layer = size
            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")

-        log.info(f"{tenant_id}/{timeline_id}: sum {total}")
-        total_on_disk += total
+        log.info(f"{tenant_id}/{timeline_id}: sum {sum}")
+        total_on_disk += sum

    assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0
    return (total_on_disk, largest_layer, smallest_layer or 0)
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -58,7 +58,7 @@ def test_basic_eviction(
    for sk in env.safekeepers:
        sk.stop()

-    timeline_path = env.timeline_dir(tenant_id, timeline_id)
+    timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    initial_local_layers = sorted(
        list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
    )
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -535,7 +535,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
            "pitr_interval": "0s",
        }
    )
-    timeline_path = env.timeline_dir(tenant_id, timeline_id)
+    timeline_path = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)

    client = env.pageserver.http_client()

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -632,14 +632,14 @@ def test_ignored_tenant_download_missing_layers(

    # ignore the tenant and remove its layers
    pageserver_http.tenant_ignore(tenant_id)
-    timeline_dir = env.timeline_dir(tenant_id, timeline_id)
+    tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    layers_removed = False
-    for dir_entry in timeline_dir.iterdir():
+    for dir_entry in tenant_timeline_dir.iterdir():
        if dir_entry.name.startswith("00000"):
            # Looks like a layer file. Remove it
            dir_entry.unlink()
            layers_removed = True
-    assert layers_removed, f"Found no layers for tenant {timeline_dir}"
+    assert layers_removed, f"Found no layers for tenant {tenant_timeline_dir}"

    # now, load it from the local files and expect it to work due to remote storage restoration
    pageserver_http.tenant_load(tenant_id=tenant_id)
@@ -688,14 +688,14 @@ def test_ignored_tenant_stays_broken_without_metadata(

    # ignore the tenant and remove its metadata
    pageserver_http.tenant_ignore(tenant_id)
-    timeline_dir = env.timeline_dir(tenant_id, timeline_id)
+    tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    metadata_removed = False
-    for dir_entry in timeline_dir.iterdir():
+    for dir_entry in tenant_timeline_dir.iterdir():
        if dir_entry.name == "metadata":
            # This is the metadata file. Remove it
            dir_entry.unlink()
            metadata_removed = True
-    assert metadata_removed, f"Failed to find metadata file in {timeline_dir}"
+    assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"

    env.pageserver.allowed_errors.append(
        f".*{tenant_id}.*: load failed.*: failed to load metadata.*"
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -214,7 +214,9 @@ def switch_pg_to_new_pageserver(

    endpoint.start()

-    timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
+    timeline_to_detach_local_path = (
+        env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+    )
    files_before_detach = os.listdir(timeline_to_detach_local_path)
    assert (
        "metadata" in files_before_detach
@@ -417,6 +419,8 @@ def test_tenant_relocation(
            new_pageserver_http.tenant_attach(tenant_id)

            # wait for tenant to finish attaching
+            tenant_status = new_pageserver_http.tenant_status(tenant_id=tenant_id)
+            assert tenant_status["state"]["slug"] in ["Attaching", "Active"]
            wait_until(
                number_of_iterations=10,
                interval=1,
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -257,7 +257,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    env.endpoints.stop_all()
    env.pageserver.stop()

-    timeline_dir = env.timeline_dir(tenant_id, timeline_id)
+    timeline_dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
    local_layer_truncated = None
    for path in Path.iterdir(timeline_dir):
        if path.name.startswith("00000"):
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
Author	SHA1	Message	Date
Alex Chi	19180e167f	remove LayerKey usage Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-28 10:00:02 -04:00
Alex Chi	b2cd142836	fix tests Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-28 09:58:31 -04:00
Alex Chi	f2d7baf0ba	rename DeleteGuard -> LayerDeletionGuard Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 16:57:11 -04:00
Alex Chi	113a4256d4	rename lcache to layer_cache Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 16:55:56 -04:00
Alex Chi	be4999713a	add comments for LayerCache Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 16:54:51 -04:00
Alex Chi	7335f155c3	fmt Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:31:15 -04:00
Alex Chi	a1ca70ff35	Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache	2023-06-27 10:26:13 -04:00
Alex Chi	ce1e57faea	fix merge conflicts Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:25:06 -04:00
Alex Chi	6f50bec781	fix merge conflicts Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:24:48 -04:00
Alex Chi	b981702ecf	Merge branch 'skyzh/layermap-ref-2' of https://github.com/neondatabase/neon into skyzh/layermap-as-cache	2023-06-27 10:23:08 -04:00
Alex Chi	21d30fc43f	use layer_desc key for replace cmp Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:16:30 -04:00
Alex Chi	137ad83f37	fix merge conflicts Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:14:09 -04:00
Alex Chi	22da36bc02	fix merge conflicts Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-27 10:04:21 -04:00
Alex Chi	900ef3d92b	Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2	2023-06-27 10:02:57 -04:00
Alex Chi	b7923fa0be	use new errmsg Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-23 16:27:49 -04:00
Alex Chi	4c4a531d5e	rename LayerMapping -> LayerFileManager Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-23 15:52:09 -04:00
Alex Chi	2b4f96345b	resolve comments Signed-off-by: Alex Chi <chi@neon.tech>	2023-06-23 15:32:57 -04:00
Alex Chi	b775ca8a58	resolve conflicts Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-20 09:52:28 -04:00
Alex Chi	ddb5862be2	Merge branch 'main' of https://github.com/neondatabase/neon into skyzh/layermap-ref-2	2023-06-20 09:12:15 -04:00
Alex Chi	a2056666ae	pgserver: move mapping logic to layer cache Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-14 15:07:38 -04:00
Alex Chi	fc190a2a19	resolve merge conflicts Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 13:56:50 -04:00
Alex Chi	faee3152f3	refactor: use LayerDesc in LayerMap (part 2) Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-06-13 13:54:59 -04:00