pageserver: rate limited warning on too many layers visited

tests: add a compaction smoke test
pageserver: add a metric counting layer visits on vectored read path
2026-06-03 13:30:38 +00:00 · 2024-04-22 17:32:12 +01:00 · 2024-04-22 17:32:12 +01:00 · 2024-04-22 17:32:10 +01:00 · 2024-04-22 17:32:05 +01:00 · 2024-04-22 10:40:35 -04:00
49 changed files with 1075 additions and 1077 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -477,7 +477,6 @@ jobs:
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -736,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:
@@ -793,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -866,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.23.2
+      VM_BUILDER_VERSION: v0.28.1

    steps:
      - name: Checkout
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -599,7 +599,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "pin-utils",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 "tracing",
 ]
@@ -2519,7 +2519,7 @@ dependencies = [
 "http 0.2.9",
 "hyper 0.14.26",
 "log",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -4059,7 +4059,7 @@ dependencies = [
 "futures",
 "once_cell",
 "pq_proto",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror",
@@ -4350,7 +4350,7 @@ dependencies = [
 "routerify",
 "rstest",
 "rustc-hash",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "scopeguard",
 "serde",
@@ -4542,7 +4542,7 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
@@ -4696,7 +4696,7 @@ dependencies = [
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-pemfile 1.0.2",
 "serde",
 "serde_json",
@@ -4956,9 +4956,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.21.9"
+version = "0.21.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
+checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -4968,9 +4968,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.22.2"
+version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
+checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -5282,7 +5282,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -5830,8 +5830,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
+source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"

 [[package]]
 name = "syn"
@@ -6193,7 +6192,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
 "futures",
 "ring 0.17.6",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.25.0",
@@ -6206,7 +6205,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 ]

@@ -6216,7 +6215,7 @@ version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
 dependencies = [
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pki-types",
 "tokio",
 ]
@@ -6677,7 +6676,7 @@ dependencies = [
 "base64 0.21.1",
 "log",
 "once_cell",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-webpki 0.100.2",
 "url",
 "webpki-roots 0.23.1",
@@ -7354,7 +7353,7 @@ dependencies = [
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "scopeguard",
 "serde",
 "serde_json",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -157,7 +157,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.1"
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -129,7 +129,6 @@ pub struct PageServerConf {

    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
-    pub(crate) get_impl: Option<String>,
 }

 impl Default for PageServerConf {
@@ -142,7 +141,6 @@ impl Default for PageServerConf {
            http_auth_type: AuthType::Trust,
            virtual_file_io_engine: None,
            get_vectored_impl: None,
-            get_impl: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -92,7 +92,6 @@ impl PageServerNode {
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
-            get_impl,
        } = &self.conf;

        let id = format!("id={}", id);
@@ -112,11 +111,6 @@ impl PageServerNode {
        } else {
            String::new()
        };
-        let get_impl = if let Some(get_impl) = get_impl {
-            format!("get_impl='{get_impl}'")
-        } else {
-            String::new()
-        };

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -130,7 +124,6 @@ impl PageServerNode {
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
-            get_impl,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,15 +1,15 @@
-use std::{collections::HashMap, str::FromStr};
+use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
-use hyper::Method;
+use hyper::{Method, StatusCode};
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -120,6 +120,12 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
+    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
+    /// mode so that it can warm up content on a pageserver.
+    TenantWarmup {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
 }

 #[derive(Parser)]
@@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> {
            }
            println!("{table}");
        }
+        Command::TenantWarmup { tenant_id } => {
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await;
+            match describe_response {
+                Ok(describe) => {
+                    if matches!(describe.policy, PlacementPolicy::Secondary) {
+                        // Fine: it's already known to controller in secondary mode: calling
+                        // again to put it into secondary mode won't cause problems.
+                    } else {
+                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
+                    }
+                }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
+                    // Fine: this tenant isn't know to the storage controller yet.
+                }
+                Err(e) => {
+                    // Unexpected API error
+                    return Err(e.into());
+                }
+            }
+
+            vps_client
+                .location_config(
+                    TenantShardId::unsharded(tenant_id),
+                    pageserver_api::models::LocationConfig {
+                        mode: pageserver_api::models::LocationConfigMode::Secondary,
+                        generation: None,
+                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
+                        shard_number: 0,
+                        shard_count: 0,
+                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
+                        tenant_conf: TenantConfig::default(),
+                    },
+                    None,
+                    true,
+                )
+                .await?;
+
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            let secondary_ps_id = describe_response
+                .shards
+                .first()
+                .unwrap()
+                .node_secondary
+                .first()
+                .unwrap();
+
+            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
+            loop {
+                let (status, progress) = vps_client
+                    .tenant_secondary_download(
+                        TenantShardId::unsharded(tenant_id),
+                        Some(Duration::from_secs(10)),
+                    )
+                    .await?;
+                println!(
+                    "Progress: {}/{} layers, {}/{} bytes",
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
+                match status {
+                    StatusCode::OK => {
+                        println!("Download complete");
+                        break;
+                    }
+                    StatusCode::ACCEPTED => {
+                        // Loop
+                    }
+                    _ => {
+                        anyhow::bail!("Unexpected download status: {status}");
+                    }
+                }
+            }
+        }
    }

    Ok(())
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -48,11 +48,11 @@ impl Key {
        }
    }

-    pub const fn next(&self) -> Key {
+    pub fn next(&self) -> Key {
        self.add(1)
    }

-    pub const fn add(&self, x: u32) -> Key {
+    pub fn add(&self, x: u32) -> Key {
        let mut key = *self;

        let r = key.field6.overflowing_add(x);
@@ -483,8 +483,6 @@ pub fn is_inherited_key(key: Key) -> bool {
    key != AUX_FILES_KEY
 }

-pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
-
 #[inline(always)]
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -94,13 +94,12 @@ impl KeySpace {

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    /// Returns the removed keyspace
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return KeySpace::default();
+                return;
            }
        };

@@ -113,37 +112,30 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

-        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
-                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
-                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
-                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
-                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
-
-        removed_accum.to_keyspace()
    }

    pub fn start(&self) -> Option<Key> {
@@ -178,11 +170,6 @@ impl KeySpace {
    pub fn overlaps(&self, range: &Range<Key>) -> bool {
        self.overlaps_at(range).is_some()
    }
-
-    /// Check if the keyspace contains a key
-    pub fn contains(&self, key: &Key) -> bool {
-        self.overlaps(&(*key..key.next()))
-    }
 }

 ///
@@ -566,16 +553,7 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(2)..Key::from_i128(3),
-                Key::from_i128(6)..Key::from_i128(7),
-                Key::from_i128(11)..Key::from_i128(12),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -605,17 +583,7 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(3)..Key::from_i128(5),
-                Key::from_i128(8)..Key::from_i128(10),
-                Key::from_i128(14)..Key::from_i128(15),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -642,11 +610,7 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace::default();
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -673,17 +637,7 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(9)..Key::from_i128(10),
-                Key::from_i128(12)..Key::from_i128(15),
-                Key::from_i128(17)..Key::from_i128(19),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,6 +5,7 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
+use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;

@@ -537,6 +538,24 @@ impl ShardIdentity {
        }
    }

+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
+    /// as a symptom of that issue.
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7454: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
    /// Return true if the key should be discarded if found in this shard's
    /// data store, e.g. during compaction after a split.
    ///
@@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
+    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
+    // because they must be included in basebackups.
+    let is_initfork = key.field5 == INIT_FORKNUM;
+
+    !is_rel_block_key(key) || is_initfork
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};

 pub use v14::bindings::{CheckPoint, ControlFileData};

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }

-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
    //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
+    // returns the position just after the page header on the next page. That's where the next
+    // record will be inserted. But the page header hasn't actually been written to the WAL
+    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
+    // error. Because of that, if the insert location is just after a page header, back off to
+    // previous page boundary.
+    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
+    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
+    }
+    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
    Ok(intermediate_lsns)
 }

@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        client.execute("CREATE table t(x int)", &[])?;

-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
-        // We will use logical message as the padding. We start with detecting how much WAL
-        // it takes for one logical message, considering all alignments and headers.
-        let base_wal_advance = {
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
+        // will use carefully-sized logical messages to advance WAL insert location such
+        // that there is just enough space on the page for the XLOG_SWITCH record.
+        loop {
+            // We start with measuring how much WAL it takes for one logical message,
+            // considering all alignments and headers.
            let before_lsn = client.pg_current_wal_insert_lsn()?;
-            // Small non-empty message bigger than few bytes is more likely than an empty
-            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
-                + XLOG_SIZE_OF_XLOG_RECORD
-        };
-        let mut remaining_lsn =
-            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
-        if remaining_lsn < base_wal_advance {
-            remaining_lsn += XLOG_BLCKSZ;
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
+
+            // Did the record cross a page boundary? If it did, start over. Crossing a
+            // page boundary adds to the apparent size of the record because of the page
+            // header, which throws off the calculation.
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
+            {
+                continue;
+            }
+            // base_size is the size of a logical message without the payload
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
+
+            // Is there enough space on the page for another logical message and an
+            // XLOG_SWITCH? If not, start over.
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
+            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+                continue;
+            }
+
+            // We will write another logical message, such that after the logical message
+            // record, there will be space for exactly one XLOG_SWITCH. How large should
+            // the logical message's payload be? An XLOG_SWITCH record has no data => its
+            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
+
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+                &[&(repeats as i32)],
+            )?;
+            break;
        }
-        let repeats = 10 + remaining_lsn - base_wal_advance;
-        info!(
-            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
-            client.pg_current_wal_insert_lsn()?,
-            remaining_lsn,
-            base_wal_advance,
-            repeats
-        );
-        client.execute(
-            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-            &[&(repeats as i32)],
-        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
--- a/libs/utils/src/rate_limit.rs
+++ b/libs/utils/src/rate_limit.rs
@@ -5,6 +5,7 @@ use std::time::{Duration, Instant};
 pub struct RateLimit {
    last: Option<Instant>,
    interval: Duration,
+    rate_limited: u32,
 }

 impl RateLimit {
@@ -12,9 +13,16 @@ impl RateLimit {
        Self {
            last: None,
            interval,
+            rate_limited: 0,
        }
    }

+    /// Returns the number of calls that were rate limited
+    /// since the last one was allowed.
+    pub fn rate_limited(&self) -> u32 {
+        self.rate_limited
+    }
+
    /// Call `f` if the rate limit allows.
    /// Don't call it otherwise.
    pub fn call<F: FnOnce()>(&mut self, f: F) {
@@ -22,9 +30,13 @@ impl RateLimit {
        match self.last {
            Some(last) if now - last <= self.interval => {
                // ratelimit
+                if let Some(updated) = self.rate_limited.checked_add(1) {
+                    self.rate_limited = updated;
+                }
            }
            _ => {
                self.last = Some(now);
+                self.rate_limited = 0;
                f();
            }
        }
@@ -50,17 +62,24 @@ mod tests {

        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 0);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 1);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 1);
+        assert_eq!(f.rate_limited(), 2);
        std::thread::sleep(Duration::from_millis(100));
        f.call(cl);
        assert_eq!(called.load(Relaxed), 2);
+        assert_eq!(f.rate_limited(), 0);
        f.call(cl);
        assert_eq!(called.load(Relaxed), 2);
+        assert_eq!(f.rate_limited(), 1);
+        f.call(cl);
        std::thread::sleep(Duration::from_millis(100));
        f.call(cl);
        assert_eq!(called.load(Relaxed), 3);
+        assert_eq!(f.rate_limited(), 0);
    }
 }
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,18 +9,45 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//! # Usage
+//!
+//! ## Producing the SVG
+//!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//! $ firefox out.svg
+//!
+//! # local timeline dir
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!
+//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
+//! (jq -r '.historic_layers[] | .layer_file_name' | cargo  run -p pagectl draw-timeline) < layer-map.json > out.svg
+//!
+//! # From an `index_part.json` in S3
+//! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
+//!
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
+//! ## Viewing
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//! **Inkscape** is better than the built-in viewers in browsers.
+//!
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
+//! to see the layer file name in the comment field.
+//!
+//! ```bash
+//!
+//! # Linux
+//! inkscape out.svg
+//!
+//! # macOS
+//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
+//!
+//! ```
+//!
+
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {

 pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
+        filename: String,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    }
+    let mut files: Vec<Layer> = vec![];
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        let line = line.unwrap();
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
            // Don't try and parse "metadata" like a key-lsn range
            continue;
        }
-        let range = parse_filename(filename);
-        ranges.push(range);
+        let (key_range, lsn_range) = parse_filename(filename);
+        files.push(Layer {
+            filename: filename.to_owned(),
+            key_range,
+            lsn_range,
+        });
    }

    // Collect all coordinates
    let mut keys: Vec<Key> = vec![];
    let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        key_range: keyr,
+        lsn_range: lsnr,
+        ..
+    } in &files
+    {
        keys.push(keyr.start);
        keys.push(keyr.end);
        lsns.push(lsnr.start);
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
            h: stretch * lsn_map.len() as f32
        }
    );
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        filename,
+        key_range: keyr,
+        lsn_range: lsnr,
+    } in &files
+    {
        let key_start = *key_map.get(&keyr.start).unwrap();
        let key_end = *key_map.get(&keyr.end).unwrap();
        let key_diff = key_end - key_start;
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
+            .comment(filename)
        );
    }
    println!("{}", EndSvg);
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker relation is unlogged.
-                    self.add_rel(rel, rel).await?;
+                    if let Err(_e) = self.add_rel(rel, rel).await {
+                        if self
+                            .timeline
+                            .get_shard_identity()
+                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                        {
+                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup.  This allows
+                            // postgres to start up.  The relation won't work, but it will be possible to DROP TABLE on it and
+                            // recreate.
+                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                            continue;
+                        }
+                    };
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -121,10 +121,8 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );

-    // after setting up logging, log the effective IO engine choice and read path implementations
+    // after setting up logging, log the effective IO engine choice
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
-    info!(?conf.get_impl, "starting with get page implementation");
-    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -30,9 +30,9 @@ use utils::{
    logging::LogFormat,
 };

+use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
-use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -91,8 +91,6 @@ pub mod defaults {

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

-    pub const DEFAULT_GET_IMPL: &str = "legacy";
-
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB

    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
@@ -140,8 +138,6 @@ pub mod defaults {

 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'

-#get_impl = '{DEFAULT_GET_IMPL}'
-
 #max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
@@ -288,8 +284,6 @@ pub struct PageServerConf {

    pub get_vectored_impl: GetVectoredImpl,

-    pub get_impl: GetImpl,
-
    pub max_vectored_read_bytes: MaxVectoredReadBytes,

    pub validate_vectored_get: bool,
@@ -420,8 +414,6 @@ struct PageServerConfigBuilder {

    get_vectored_impl: BuilderValue<GetVectoredImpl>,

-    get_impl: BuilderValue<GetImpl>,
-
    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,

    validate_vectored_get: BuilderValue<bool>,
@@ -511,7 +503,6 @@ impl PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
-            get_impl: Set(DEFAULT_GET_IMPL.parse().unwrap()),
            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
@@ -690,10 +681,6 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

-    pub fn get_impl(&mut self, value: GetImpl) {
-        self.get_impl = BuilderValue::Set(value);
-    }
-
    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
        self.max_vectored_read_bytes = BuilderValue::Set(value);
    }
@@ -763,7 +750,6 @@ impl PageServerConfigBuilder {
                secondary_download_concurrency,
                ingest_batch_size,
                get_vectored_impl,
-                get_impl,
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
@@ -1049,9 +1035,6 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
-                "get_impl" => {
-                    builder.get_impl(parse_toml_from_str("get_impl", item)?)
-                }
                "max_vectored_read_bytes" => {
                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
                    builder.get_max_vectored_read_bytes(
@@ -1143,7 +1126,6 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-            get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
            max_vectored_read_bytes: MaxVectoredReadBytes(
                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                    .expect("Invalid default constant"),
@@ -1383,7 +1365,6 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
@@ -1457,7 +1438,6 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
-                get_impl: defaults::DEFAULT_GET_IMPL.parse().unwrap(),
                max_vectored_read_bytes: MaxVectoredReadBytes(
                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
                        .expect("Invalid default constant")
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -86,11 +86,6 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.

-use std::{
-    sync::{atomic::AtomicU32, Arc},
-    time::Duration,
-};
-
 use crate::task_mgr::TaskKind;

 pub(crate) mod optional_counter;
@@ -103,156 +98,6 @@ pub struct RequestContext {
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
    pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
-    pub read_path_stats: ReadPathStats,
-}
-
-#[derive(Debug, Default, Clone)]
-pub struct ReadPathStats {
-    pub inner: Arc<ReadPathStatsInner>,
-}
-
-#[derive(Debug, Default)]
-pub struct ReadPathStatsInner {
-    pub get_reconstruct_data_time: AtomicU32,
-    pub plan_read_time: AtomicU32,
-    pub read_time: AtomicU32,
-    pub sort_reconstruct_data_time: AtomicU32,
-    pub layer_search_time: AtomicU32,
-    pub keyspace_manipulation_time: AtomicU32,
-    pub in_mem_layer_find_time: AtomicU32,
-    pub fringe_fondle_time: AtomicU32,
-    pub layer_map_search_time: AtomicU32,
-    pub buffer_cache_hits: AtomicU32,
-    pub layers_visited: AtomicU32,
-}
-
-impl std::fmt::Display for ReadPathStatsInner {
-    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
-        write!(
-            f,
-            "get_reconstruct_data_time={} plan_read_time={} read_time={} sort_time={} layer_search_time={} keyspace_manipulation_time={} in_mem_layer_find_time={} fringe_fondle_time={} layer_map_search_time={} buffer_cache_hits={} layers_visited={}",
-            self.get_reconstruct_data_time.load(std::sync::atomic::Ordering::Relaxed),
-            self.plan_read_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.read_time.load(std::sync::atomic::Ordering::Relaxed),
-            self.sort_reconstruct_data_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.layer_search_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.keyspace_manipulation_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.in_mem_layer_find_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.fringe_fondle_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.layer_map_search_time
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.buffer_cache_hits
-                .load(std::sync::atomic::Ordering::Relaxed),
-            self.layers_visited
-                .load(std::sync::atomic::Ordering::Relaxed)
-        )
-    }
-}
-
-impl ReadPathStatsInner {
-    pub fn reset(&self) {
-        self.get_reconstruct_data_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.plan_read_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.read_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.sort_reconstruct_data_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.layer_search_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.keyspace_manipulation_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.in_mem_layer_find_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.fringe_fondle_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.layer_map_search_time
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.buffer_cache_hits
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-        self.layers_visited
-            .store(0, std::sync::atomic::Ordering::Relaxed);
-    }
-
-    pub fn add_get_reconstruct_data_time(&self, dur: Duration) {
-        self.get_reconstruct_data_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_plan_read_time(&self, dur: Duration) {
-        self.plan_read_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_read_time(&self, dur: Duration) {
-        self.read_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_sort_reconstruct_data_time(&self, dur: Duration) {
-        self.sort_reconstruct_data_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_layer_search_time(&self, dur: Duration) {
-        self.layer_search_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_keyspace_manipulation_timer(&self, dur: Duration) {
-        self.keyspace_manipulation_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_in_mem_layer_find_timer(&self, dur: Duration) {
-        self.in_mem_layer_find_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_fringe_fondle_time(&self, dur: Duration) {
-        self.fringe_fondle_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn add_layer_map_search_time(&self, dur: Duration) {
-        self.layer_map_search_time.fetch_add(
-            dur.as_micros().try_into().unwrap(),
-            std::sync::atomic::Ordering::Relaxed,
-        );
-    }
-
-    pub fn inc_layer_visited(&self) {
-        self.layers_visited
-            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-    }
-
-    pub fn inc_buffer_cache_hits(&self) {
-        self.buffer_cache_hits
-            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-    }
 }

 /// The kind of access to the page cache.
@@ -309,7 +154,6 @@ impl RequestContextBuilder {
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
                micros_spent_throttled: Default::default(),
-                read_path_stats: Default::default(),
            },
        }
    }
@@ -324,7 +168,6 @@ impl RequestContextBuilder {
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
                micros_spent_throttled: Default::default(),
-                read_path_stats: original.read_path_stats.clone(),
            },
        }
    }
@@ -448,12 +291,4 @@ impl RequestContext {
    pub(crate) fn page_content_kind(&self) -> PageContentKind {
        self.page_content_kind
    }
-
-    pub fn report_stats(&self) {
-        tracing::info!("Read path stats: {}", *self.read_path_stats.inner);
-    }
-
-    pub fn reset_stats(&self) {
-        self.read_path_stats.inner.reset();
-    }
 }
--- a/pageserver/src/context/optional_counter.rs
+++ b/pageserver/src/context/optional_counter.rs
@@ -72,7 +72,6 @@ impl MicroSecondsCounterU32 {
            Err(_) => Err("add(): duration conversion error"),
        }
    }
-
    pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
        let val = self.inner.close()?;
        let val = Duration::from_micros(val as u64);
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::MissingKey(e) => {
+                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
+            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -86,54 +86,50 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_read_num_fs_layers",
-        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
-#[derive(
-    Clone,
-    Copy,
-    enum_map::Enum,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    strum_macros::IntoStaticStr,
-)]
-pub(crate) enum GetKind {
-    Singular,
-    Vectored,
-}

 pub(crate) struct ReconstructTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
+    ok: Histogram,
+    err: Histogram,
 }

 pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
-        &["get_kind"],
+        &["result"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric");
-
    ReconstructTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
+        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
+        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
    }
 });

 impl ReconstructTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
+    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
+        match result {
+            Ok(_) => &self.ok,
+            Err(_) => &self.err,
        }
    }
 }
@@ -146,33 +142,13 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) struct ReconstructDataTimeMetrics {
-    singular: Histogram,
-    vectored: Histogram,
-}
-
-impl ReconstructDataTimeMetrics {
-    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
-        match get_kind {
-            GetKind::Singular => &self.singular,
-            GetKind::Vectored => &self.vectored,
-        }
-    }
-}
-
-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
-    let inner = register_histogram_vec!(
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
-        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric");
-
-    ReconstructDataTimeMetrics {
-        singular: inner.with_label_values(&[GetKind::Singular.into()]),
-        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
-    }
+    .expect("failed to define a metric")
 });

 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -2804,7 +2780,8 @@ pub fn preinitialize_metrics() {

    // histograms
    [
-        &READ_NUM_FS_LAYERS,
+        &READ_NUM_LAYERS_VISITED,
+        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1446,10 +1446,14 @@ impl<'a> DatadirModification<'a> {
                    // reset the map.
                    return Err(e.into());
                }
-                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
-                // we are assuming that all _other_ possible errors represents a missing key.  If some
-                // other error occurs, we may incorrectly reset the map of aux files.
-                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
+                // the original code assumes all other errors are missing keys. Therefore, we keep the code path
+                // the same for now, though in theory, we should only match the `MissingKey` variant.
+                Err(
+                    PageReconstructError::Other(_)
+                    | PageReconstructError::WalRedo(_)
+                    | PageReconstructError::MissingKey { .. },
+                ) => {
                    // Key is missing, we must insert an image as the basis for subsequent deltas.

                    let mut dir = AuxFilesDirectory {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -559,9 +559,10 @@ impl Tenant {
            // By doing what we do here, the index part upload is retried.
            // If control plane retries timeline creation in the meantime, the mgmt API handler
            // for timeline creation will coalesce on the upload we queue here.
+            // FIXME: this branch should be dead code as we no longer write local metadata.
            let rtc = timeline.remote_client.as_ref().unwrap();
            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_metadata_update(&metadata)?;
+            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
        }

        timeline
@@ -3027,7 +3028,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_metadata_update(&metadata)
+                .schedule_index_upload_for_full_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3858,10 +3859,8 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
-    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
-    use tests::storage_layer::ValuesReconstructState;
    use tests::timeline::{GetVectoredError, ShutdownMode};

    static TEST_KEY: Lazy<Key> =
@@ -4650,9 +4649,7 @@ mod tests {
        for read in reads {
            info!("Doing vectored read on {:?}", read);

-            let vectored_res = tline
-                .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
-                .await;
+            let vectored_res = tline.get_vectored_impl(read.clone(), reads_lsn, &ctx).await;
            tline
                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
                .await;
@@ -4661,64 +4658,6 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_get_vectored_aux_files")?;
-
-        let (tenant, ctx) = harness.load().await;
-        let tline = tenant
-            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
-            .await?;
-        let tline = tline.raw_timeline().unwrap();
-
-        let mut modification = tline.begin_modification(Lsn(0x1000));
-        modification.put_file("foo/bar1", b"content1", &ctx).await?;
-        modification.set_lsn(Lsn(0x1008))?;
-        modification.put_file("foo/bar2", b"content2", &ctx).await?;
-        modification.commit(&ctx).await?;
-
-        let child_timeline_id = TimelineId::generate();
-        tenant
-            .branch_timeline_test(
-                tline,
-                child_timeline_id,
-                Some(tline.get_last_record_lsn()),
-                &ctx,
-            )
-            .await?;
-
-        let child_timeline = tenant
-            .get_timeline(child_timeline_id, true)
-            .expect("Should have the branched timeline");
-
-        let aux_keyspace = KeySpace {
-            ranges: vec![NON_INHERITED_RANGE],
-        };
-        let read_lsn = child_timeline.get_last_record_lsn();
-
-        let vectored_res = child_timeline
-            .get_vectored_impl(
-                aux_keyspace.clone(),
-                read_lsn,
-                ValuesReconstructState::new(),
-                &ctx,
-            )
-            .await;
-
-        child_timeline
-            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
-            .await;
-
-        let images = vectored_res?;
-        let mut key = NON_INHERITED_RANGE.start;
-        while key < NON_INHERITED_RANGE.end {
-            assert!(matches!(images[&key], Err(PageReconstructError::Other(_))));
-            key = key.next();
-        }
-
-        Ok(())
-    }
-
    // Test that vectored get handles layer gaps correctly
    // by advancing into the next ancestor timeline if required.
    //
@@ -4847,12 +4786,7 @@ mod tests {
            ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
        };
        let results = child_timeline
-            .get_vectored_impl(
-                read.clone(),
-                current_lsn,
-                ValuesReconstructState::new(),
-                &ctx,
-            )
+            .get_vectored_impl(read.clone(), current_lsn, &ctx)
            .await?;

        for (key, img_res) in results {
@@ -4985,7 +4919,6 @@ mod tests {
                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                    },
                    query_lsn,
-                    ValuesReconstructState::new(),
                    &ctx,
                )
                .await;
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -203,10 +203,7 @@ impl<'a> FileBlockReader<'a> {
                    format!("Failed to read immutable buf: {e:#}"),
                )
            })? {
-            ReadBufResult::Found(guard) => {
-                ctx.read_path_stats.inner.inc_buffer_cache_hits();
-                Ok(guard.into())
-            }
+            ReadBufResult::Found(guard) => Ok(guard.into()),
            ReadBufResult::NotFound(write_guard) => {
                // Read the page from disk into the buffer
                let write_guard = self.fill_buffer(write_guard, blknum).await?;
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -235,6 +235,12 @@ impl TimelineMetadata {
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
    }
+
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
+    }
 }

 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
    }
 }

+/// Parts of the metadata which are regularly modified.
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
+
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -236,6 +236,7 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

+use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
@@ -536,9 +537,10 @@ impl RemoteTimelineClient {
    // Upload operations.
    //

-    ///
    /// Launch an index-file upload operation in the background, with
-    /// updated metadata.
+    /// fully updated metadata.
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
    ///
    /// The upload will be added to the queue immediately, but it
    /// won't be performed until all previously scheduled layer file
@@ -550,7 +552,7 @@ impl RemoteTimelineClient {
    /// If there were any changes to the list of files, i.e. if any
    /// layer file uploads were scheduled, since the last index file
    /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
        self: &Arc<Self>,
        metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -566,6 +568,27 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.latest_metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -2024,7 +2047,7 @@ mod tests {
        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
        client
-            .schedule_index_upload_for_metadata_update(&metadata)
+            .schedule_index_upload_for_full_metadata_update(&metadata)
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,

    keys_done: KeySpaceRandomAccum,
+    layers_visited: u32,
 }

 impl ValuesReconstructState {
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
+            layers_visited: 0,
        }
    }

@@ -138,27 +140,12 @@ impl ValuesReconstructState {
        }
    }

-    /// This function is called after reading a keyspace from a layer.
-    /// It checks if the read path has now moved past the cached Lsn for any keys.
-    ///
-    /// Implementation note: We intentionally iterate over the keys for which we've
-    /// already collected some reconstruct data. This avoids scaling complexity with
-    /// the size of the search space.
-    pub(crate) fn on_lsn_advanced(&mut self, keyspace: &KeySpace, advanced_to: Lsn) {
-        for (key, value) in self.keys.iter_mut() {
-            if !keyspace.contains(key) {
-                continue;
-            }
+    pub(crate) fn on_layer_visited(&mut self) {
+        self.layers_visited += 1;
+    }

-            if let Ok(state) = value {
-                if state.situation != ValueReconstructSituation::Complete
-                    && state.get_cached_lsn() >= Some(advanced_to)
-                {
-                    state.situation = ValueReconstructSituation::Complete;
-                    self.keys_done.add_key(*key);
-                }
-            }
-        }
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
    }

    /// Update the state collected for a given key.
@@ -185,18 +172,11 @@ impl ValuesReconstructState {
                        true
                    }
                    Value::WalRecord(rec) => {
-                        debug_assert!(
-                            Some(lsn) > state.get_cached_lsn(),
-                            "Attempt to collect a record below cached LSN for walredo: {} < {}",
-                            lsn,
-                            state
-                                .get_cached_lsn()
-                                .expect("Assertion can only fire if a cached lsn is present")
-                        );
-
+                        let reached_cache =
+                            state.get_cached_lsn().map(|clsn| clsn + 1) == Some(lsn);
                        let will_init = rec.will_init();
                        state.records.push((lsn, rec));
-                        will_init
+                        will_init || reached_cache
                    }
                },
            };
@@ -379,28 +359,11 @@ impl ReadableLayer {
    ) -> Result<(), GetVectoredError> {
        match self {
            ReadableLayer::PersistentLayer(layer) => {
-                if keyspace.total_size() == 1 {
-                    tracing::info!(
-                        "Vectored path for {}: {} @ {:?}",
-                        keyspace.start().unwrap(),
-                        layer.local_path(),
-                        lsn_range
-                    );
-                }
-
                layer
                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                    .await
            }
            ReadableLayer::InMemoryLayer(layer) => {
-                if keyspace.total_size() == 1 {
-                    tracing::info!(
-                        "Vectored path for {}: in mem layer @ {:?}",
-                        keyspace.start().unwrap(),
-                        lsn_range
-                    );
-                }
-
                layer
                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
                    .await
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -58,7 +58,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::sync::Arc;
-use std::time::Instant;
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -218,7 +217,6 @@ pub struct DeltaLayerInner {
    // values copied from summary
    index_start_blk: u32,
    index_root_blk: u32,
-    lsn_range: Range<Lsn>,

    file: VirtualFile,
    file_id: FileId,
@@ -730,6 +728,9 @@ impl DeltaLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;
+
            if actual_summary != expected_summary {
                bail!(
                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
@@ -744,7 +745,6 @@ impl DeltaLayerInner {
            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-            lsn_range: actual_summary.lsn_range,
            max_vectored_read_bytes,
        }))
    }
@@ -756,9 +756,6 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let plan_timer = Instant::now();
-
        let mut need_image = true;
        // Scan the page versions backwards, starting from `lsn`.
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
@@ -794,12 +791,6 @@ impl DeltaLayerInner {
            )
            .await?;

-        ctx.read_path_stats
-            .inner
-            .add_plan_read_time(plan_timer.elapsed());
-
-        let read_timer = Instant::now();
-
        let ctx = &RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::DeltaLayerValue)
            .build();
@@ -838,10 +829,6 @@ impl DeltaLayerInner {
            }
        }

-        ctx.read_path_stats
-            .inner
-            .add_read_time(read_timer.elapsed());
-
        // If an older page image is needed to reconstruct the page, let the
        // caller know.
        if need_image {
@@ -865,9 +852,6 @@ impl DeltaLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let plan_timer = Instant::now();
-
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
@@ -885,7 +869,7 @@ impl DeltaLayerInner {
        let data_end_offset = self.index_start_offset();

        let reads = Self::plan_reads(
-            &keyspace,
+            keyspace,
            lsn_range,
            data_end_offset,
            index_reader,
@@ -896,26 +880,14 @@ impl DeltaLayerInner {
        .await
        .map_err(GetVectoredError::Other)?;

-        ctx.read_path_stats
-            .inner
-            .add_plan_read_time(plan_timer.elapsed());
-
-        let read_timer = Instant::now();
-
        self.do_reads_and_update_state(reads, reconstruct_state)
            .await;

-        reconstruct_state.on_lsn_advanced(&keyspace, self.lsn_range.start);
-
-        ctx.read_path_stats
-            .inner
-            .add_read_time(read_timer.elapsed());
-
        Ok(())
    }

    async fn plan_reads<Reader>(
-        keyspace: &KeySpace,
+        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
        data_end_offset: u64,
        index_reader: DiskBtreeReader<Reader, DELTA_KEY_SIZE>,
@@ -1563,7 +1535,7 @@ mod test {

        // Plan and validate
        let vectored_reads = DeltaLayerInner::plan_reads(
-            &keyspace,
+            keyspace.clone(),
            lsn_range.clone(),
            disk_offset,
            reader,
@@ -1815,7 +1787,7 @@ mod test {
            let data_end_offset = inner.index_start_blk as u64 * PAGE_SZ as u64;

            let vectored_reads = DeltaLayerInner::plan_reads(
-                &keyspace,
+                keyspace.clone(),
                entries_meta.lsn_range.clone(),
                data_end_offset,
                index_reader,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -55,7 +55,6 @@ use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::sync::Arc;
-use std::time::Instant;
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
 use tracing::*;
@@ -397,6 +396,8 @@ impl ImageLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;

            if actual_summary != expected_summary {
                bail!(
@@ -423,9 +424,6 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let plan_timer = Instant::now();
-
        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);
@@ -441,12 +439,6 @@ impl ImageLayerInner {
            )
            .await?
        {
-            ctx.read_path_stats
-                .inner
-                .add_plan_read_time(plan_timer.elapsed());
-
-            let read_timer = Instant::now();
-
            let blob = block_reader
                .block_cursor()
                .read_blob(
@@ -460,11 +452,6 @@ impl ImageLayerInner {
            let value = Bytes::from(blob);

            reconstruct_state.img = Some((self.lsn, value));
-
-            ctx.read_path_stats
-                .inner
-                .add_read_time(read_timer.elapsed());
-
            Ok(ValueReconstructResult::Complete)
        } else {
            Ok(ValueReconstructResult::Missing)
@@ -479,27 +466,14 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let plan_timer = Instant::now();
-
        let reads = self
            .plan_reads(keyspace, ctx)
            .await
            .map_err(GetVectoredError::Other)?;

-        ctx.read_path_stats
-            .inner
-            .add_plan_read_time(plan_timer.elapsed());
-
-        let read_timer = Instant::now();
-
        self.do_reads_and_update_state(reads, reconstruct_state)
            .await;

-        ctx.read_path_stats
-            .inner
-            .add_read_time(read_timer.elapsed());
-
        Ok(())
    }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use std::cmp::Ordering;
-use std::fmt::Write as _;
+use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
@@ -54,6 +54,12 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
+    local_path_str: Arc<str>,
+
+    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
+    frozen_local_path_str: OnceLock<Arc<str>>,
+
    opened_at: Instant,

    /// The above fields never change, except for `end_lsn`, which is only set once.
@@ -241,6 +247,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        self.frozen_local_path_str
+            .get()
+            .unwrap_or(&self.local_path_str)
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -298,9 +310,6 @@ impl InMemoryLayer {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let read_timer = Instant::now();
-
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

@@ -336,10 +345,6 @@ impl InMemoryLayer {
            }
        }

-        ctx.read_path_stats
-            .inner
-            .add_read_time(read_timer.elapsed());
-
        // release lock on 'inner'

        // If an older page image is needed to reconstruct the page, let the
@@ -362,9 +367,6 @@ impl InMemoryLayer {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        ctx.read_path_stats.inner.inc_layer_visited();
-        let plan_timer = Instant::now();
-
        let ctx = RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .build();
@@ -404,12 +406,6 @@ impl InMemoryLayer {
            }
        }

-        ctx.read_path_stats
-            .inner
-            .add_plan_read_time(plan_timer.elapsed());
-
-        let read_timer = Instant::now();
-
        let keyspace_size = keyspace.total_size();

        let mut completed_keys = HashSet::new();
@@ -442,20 +438,28 @@ impl InMemoryLayer {
            }
        }

-        reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
-
-        ctx.read_path_stats
-            .inner
-            .add_read_time(read_timer.elapsed());
-
        Ok(())
    }
 }

+fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
+    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
+}
+
+fn inmem_layer_log_display(
+    mut f: impl Write,
+    timeline: TimelineId,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+) -> std::fmt::Result {
+    write!(f, "timeline {} in-memory ", timeline)?;
+    inmem_layer_display(f, start_lsn, end_lsn)
+}
+
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
-        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+        inmem_layer_display(f, self.start_lsn, end_lsn)
    }
 }

@@ -480,6 +484,12 @@ impl InMemoryLayer {

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
+            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
            tenant_shard_id,
@@ -574,6 +584,15 @@ impl InMemoryLayer {
        );
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

+        self.frozen_local_path_str
+            .set({
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
+                    .unwrap();
+                buf.into()
+            })
+            .expect("frozen_local_path_str set only once");
+
        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
                assert!(*lsn < end_lsn);
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -336,12 +336,6 @@ impl Layer {
            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
            .await
-            .map_err(|err| match err {
-                GetVectoredError::Other(err) => GetVectoredError::Other(
-                    err.context(format!("get_values_reconstruct_data for layer {self}")),
-                ),
-                err => err,
-            })
    }

    /// Download the layer if evicted.
@@ -401,6 +395,10 @@ impl Layer {
        &self.0.path
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        &self.0.path_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -523,6 +521,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the full path, used for traversal id.
+    path_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -728,6 +729,7 @@ impl LayerInner {

        LayerInner {
            conf,
+            path_str: path.to_string().into(),
            path,
            desc,
            timeline: Arc::downgrade(timeline),
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -818,11 +818,13 @@ async fn eviction_cancellation_on_drop() {
    }
 }

+/// A test case to remind you the cost of these structures. You can bump the size limit
+/// below if it is really necessary to add more fields to the structures.
 #[test]
 fn layer_size() {
    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,14 +16,14 @@ use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
+    key::AUX_FILES_KEY,
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
-    shard::{ShardIdentity, TenantShardId},
+    shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -119,8 +119,8 @@ use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

+use super::config::TenantConf;
 use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
-use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
 use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -428,6 +428,62 @@ pub(crate) enum PageReconstructError {
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(anyhow::Error),
+
+    #[error("{0}")]
+    MissingKey(MissingKeyError),
+}
+
+#[derive(Debug)]
+pub struct MissingKeyError {
+    stuck_at_lsn: bool,
+    key: Key,
+    shard: ShardNumber,
+    cont_lsn: Lsn,
+    request_lsn: Lsn,
+    ancestor_lsn: Option<Lsn>,
+    traversal_path: Vec<TraversalPathItem>,
+    backtrace: Option<std::backtrace::Backtrace>,
+}
+
+impl std::fmt::Display for MissingKeyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.stuck_at_lsn {
+            // Records are found in this timeline but no image layer or initial delta record was found.
+            write!(
+                f,
+                "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
+                self.key, self.shard, self.cont_lsn, self.request_lsn
+            )?;
+            if let Some(ref ancestor_lsn) = self.ancestor_lsn {
+                write!(f, ", ancestor {}", ancestor_lsn)?;
+            }
+        } else {
+            // No records in this timeline.
+            write!(
+                f,
+                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                self.key, self.shard, self.cont_lsn, self.request_lsn
+            )?;
+        }
+
+        if !self.traversal_path.is_empty() {
+            writeln!(f)?;
+        }
+
+        for (r, c, l) in &self.traversal_path {
+            writeln!(
+                f,
+                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                r, c, l,
+            )?;
+        }
+
+        if let Some(ref backtrace) = self.backtrace {
+            write!(f, "\n{}", backtrace)?;
+        }
+
+        Ok(())
+    }
 }

 impl PageReconstructError {
@@ -439,6 +495,7 @@ impl PageReconstructError {
            AncestorLsnTimeout(_) => false,
            Cancelled | AncestorStopping(_) => true,
            WalRedo(_) => false,
+            MissingKey { .. } => false,
        }
    }
 }
@@ -586,19 +643,6 @@ impl From<GetVectoredError> for CreateImageLayersError {
    }
 }

-impl From<GetVectoredError> for PageReconstructError {
-    fn from(e: GetVectoredError) -> Self {
-        match e {
-            GetVectoredError::Cancelled => PageReconstructError::Cancelled,
-            GetVectoredError::InvalidLsn(_) => PageReconstructError::Other(anyhow!("Invalid LSN")),
-            err @ GetVectoredError::Oversized(_) => PageReconstructError::Other(err.into()),
-            err @ GetVectoredError::MissingKey(_) => PageReconstructError::Other(err.into()),
-            err @ GetVectoredError::GetReadyAncestorError(_) => PageReconstructError::from(err),
-            GetVectoredError::Other(err) => PageReconstructError::Other(err),
-        }
-    }
-}
-
 impl From<GetReadyAncestorError> for PageReconstructError {
    fn from(e: GetReadyAncestorError) -> Self {
        use GetReadyAncestorError::*;
@@ -628,23 +672,6 @@ pub enum GetVectoredImpl {
    Vectored,
 }

-#[derive(
-    Eq,
-    PartialEq,
-    Debug,
-    Copy,
-    Clone,
-    strum_macros::EnumString,
-    strum_macros::Display,
-    serde_with::DeserializeFromStr,
-    serde_with::SerializeDisplay,
-)]
-#[strum(serialize_all = "kebab-case")]
-pub enum GetImpl {
-    Legacy,
-    Vectored,
-}
-
 pub(crate) enum WaitLsnWaiter<'a> {
    Timeline(&'a Timeline),
    Tenant,
@@ -706,6 +733,16 @@ impl Timeline {
        key: Key,
        lsn: Lsn,
        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.timeline_get_throttle.throttle(ctx, 1).await;
+        self.get_impl(key, lsn, ctx).await
+    }
+    /// Not subject to [`Self::timeline_get_throttle`].
+    async fn get_impl(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        if !lsn.is_valid() {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
@@ -716,7 +753,13 @@ impl Timeline {
        // page_service.
        debug_assert!(!self.shard_identity.is_key_disposable(&key));

-        self.timeline_get_throttle.throttle(ctx, 1).await;
+        // XXX: structured stats collection for layer eviction here.
+        trace!(
+            "get page request for {}@{} from task kind {:?}",
+            key,
+            lsn,
+            ctx.task_kind()
+        );

        // Check the page cache. We will get back the most recent page with lsn <= `lsn`.
        // The cached image can be returned directly if there is no WAL between the cached image
@@ -739,107 +782,22 @@ impl Timeline {
            None => None,
        };

-        let res = match self.conf.get_impl {
-            GetImpl::Legacy => {
-                let reconstruct_state = ValueReconstructState {
-                    records: Vec::new(),
-                    img: cached_page_img,
-                };
-
-                self.get_impl(key, lsn, reconstruct_state, ctx).await
-            }
-            GetImpl::Vectored => {
-                let keyspace = KeySpace {
-                    ranges: vec![key..key.next()],
-                };
-
-                // Initialise the reconstruct state for the key with the cache
-                // entry returned above.
-                let mut reconstruct_state = ValuesReconstructState::new();
-                let mut key_state = VectoredValueReconstructState::default();
-                key_state.img = cached_page_img;
-                reconstruct_state.keys.insert(key, Ok(key_state));
-
-                let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
-                    .await;
-
-                if self.conf.validate_vectored_get {
-                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                        .await;
-                }
-
-                let key_value = vectored_res?.pop_first();
-                match key_value {
-                    Some((got_key, value)) => {
-                        if got_key != key {
-                            error!(
-                                "Expected {}, but singular vectored get returned {}",
-                                key, got_key
-                            );
-                            Err(PageReconstructError::Other(anyhow!(
-                                "Singular vectored get returned wrong key"
-                            )))
-                        } else {
-                            value
-                        }
-                    }
-                    None => {
-                        error!(
-                            "Expected {}, but singular vectored get returned nothing",
-                            key
-                        );
-                        Err(PageReconstructError::Other(anyhow!(
-                            "Singular vectored get did not return a value for {}",
-                            key
-                        )))
-                    }
-                }
-            }
+        let mut reconstruct_state = ValueReconstructState {
+            records: Vec::new(),
+            img: cached_page_img,
        };

-        if key.to_i128() % 100 < 10 {
-            ctx.report_stats();
-        }
-
-        ctx.reset_stats();
-
-        res
-    }
-
-    /// Not subject to [`Self::timeline_get_throttle`].
-    async fn get_impl(
-        &self,
-        key: Key,
-        lsn: Lsn,
-        mut reconstruct_state: ValueReconstructState,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        // XXX: structured stats collection for layer eviction here.
-        trace!(
-            "get page request for {}@{} from task kind {:?}",
-            key,
-            lsn,
-            ctx.task_kind()
-        );
-
-        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
-            .for_get_kind(crate::metrics::GetKind::Singular)
-            .start_timer();
-        let get_reconstruct_data_timer = Instant::now();
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
        let path = self
            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
-        ctx.read_path_stats
-            .inner
-            .add_get_reconstruct_data_time(get_reconstruct_data_timer.elapsed());
        timer.stop_and_record();

        let start = Instant::now();
        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
        let elapsed = start.elapsed();
        crate::metrics::RECONSTRUCT_TIME
-            .for_get_kind(crate::metrics::GetKind::Singular)
+            .for_result(&res)
            .observe(elapsed.as_secs_f64());

        if cfg!(feature = "testing") && res.is_err() {
@@ -852,7 +810,7 @@ impl Timeline {
                writeln!(
                    msg,
                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
-                    layer(),
+                    layer,
                )
                .expect("string grows")
            });
@@ -866,6 +824,10 @@ impl Timeline {
    }

    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+    // If we are visiting more than 50 layers to reconstruct one value,
+    // it's likely that something is wrong with compaction. We print a
+    // rate limited warning in that case.
+    pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u64 = 50;

    /// Look up multiple page versions at a given LSN
    ///
@@ -918,9 +880,7 @@ impl Timeline {
                self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
            }
            GetVectoredImpl::Vectored => {
-                let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
-                    .await;
+                let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;

                if self.conf.validate_vectored_get {
                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
@@ -966,38 +926,18 @@ impl Timeline {
        for range in keyspace.ranges {
            let mut key = range.start;
            while key != range.end {
-                let block = self
-                    .get_impl(key, lsn, ValueReconstructState::default(), ctx)
-                    .await;
+                let block = self.get_impl(key, lsn, ctx).await;

                use PageReconstructError::*;
                match block {
                    Err(Cancelled | AncestorStopping(_)) => {
                        return Err(GetVectoredError::Cancelled)
                    }
-                    Err(Other(err))
-                        if err.to_string().contains("could not find data for key")
-                            && !NON_INHERITED_RANGE.contains(&key) =>
-                    {
-                        return Err(GetVectoredError::MissingKey(key))
-                    }
-                    Err(Other(err))
-                        if err
-                            .to_string()
-                            .contains("downloading evicted layer file failed") =>
-                    {
-                        return Err(GetVectoredError::Other(err))
-                    }
-                    Err(Other(err))
-                        if err
-                            .chain()
-                            .any(|cause| cause.to_string().contains("layer loading failed")) =>
-                    {
-                        // The intent here is to achieve error parity with the vectored read path.
-                        // When vectored read fails to load a layer it fails the whole read, hence
-                        // we mimic this behaviour here to keep the validation happy.
-                        return Err(GetVectoredError::Other(err));
-                    }
+                    // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
+                    Err(MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        ..
+                    })) => return Err(GetVectoredError::MissingKey(key)),
                    _ => {
                        values.insert(key, block);
                        key = key.next();
@@ -1013,50 +953,47 @@ impl Timeline {
        &self,
        keyspace: KeySpace,
        lsn: Lsn,
-        mut reconstruct_state: ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        use crate::metrics::GetKind;
+        let mut reconstruct_state = ValuesReconstructState::new();

-        let get_kind = if keyspace.total_size() == 1 {
-            GetKind::Singular
-        } else {
-            GetKind::Vectored
-        };
-
-        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
-            .for_get_kind(get_kind)
-            .start_timer();
-        let timer = Instant::now();
        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
            .await?;
-        ctx.read_path_stats
-            .inner
-            .add_get_reconstruct_data_time(timer.elapsed());
-        get_data_timer.stop_and_record();

-        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
-            .for_get_kind(get_kind)
-            .start_timer();
        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
+        let layers_visited = reconstruct_state.get_layers_visited();
        for (key, res) in reconstruct_state.keys {
            match res {
                Err(err) => {
                    results.insert(key, Err(err));
                }
                Ok(state) => {
-                    let timer = Instant::now();
                    let state = ValueReconstructState::from(state);
-                    ctx.read_path_stats
-                        .inner
-                        .add_sort_reconstruct_data_time(timer.elapsed());

                    let reconstruct_res = self.reconstruct_value(key, lsn, state).await;
                    results.insert(key, reconstruct_res);
                }
            }
        }
-        reconstruct_timer.stop_and_record();
+
+        // Note that this is an approximation. Tracking the exact number of layers visited
+        // per key requires virtually unbounded memory usage and is inefficient
+        // (i.e. segment tree tracking each range queried from a layer)
+        let average_layers_visited = layers_visited as f64 / results.len() as f64;
+
+        crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(average_layers_visited);
+
+        if average_layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD as f64 {
+            use utils::rate_limit::RateLimit;
+            static LOGGED: Lazy<Mutex<RateLimit>> =
+                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+            let mut rate_limit = LOGGED.lock().unwrap();
+            let rate_limited = rate_limit.rate_limited();
+            rate_limit.call(|| {
+                warn!("Too many layers visited by vectored read: {} >= {}; rate limited {} occurences",
+                      average_layers_visited, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
+            });
+        }

        Ok(results)
    }
@@ -1092,10 +1029,10 @@ impl Timeline {
                panic!(concat!("Sequential get failed with {}, but vectored get did not",
                               " - keyspace={:?} lsn={}"),
                       seq_err, keyspace, lsn) },
-            (Ok(seq_ok), Err(vec_err)) => {
+            (Ok(_), Err(vec_err)) => {
                panic!(concat!("Vectored get failed with {}, but sequential get did not",
-                               " - keyspace={:?} lsn={} seq_ok={:?}"),
-                       vec_err, keyspace, lsn, seq_ok) },
+                               " - keyspace={:?} lsn={}"),
+                       vec_err, keyspace, lsn) },
            (Err(seq_err), Err(vec_err)) => {
                assert!(errors_match(seq_err, vec_err),
                        "Mismatched errors: {seq_err} != {vec_err} - keyspace={keyspace:?} lsn={lsn}")},
@@ -2838,7 +2775,7 @@ impl Timeline {
    }
 }

-type TraversalId = String;
+type TraversalId = Arc<str>;

 trait TraversalLayerExt {
    fn traversal_id(&self) -> TraversalId;
@@ -2846,13 +2783,13 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        self.local_path().to_string()
+        Arc::clone(self.local_path_str())
    }
 }

 impl TraversalLayerExt for Arc<InMemoryLayer> {
    fn traversal_id(&self) -> TraversalId {
-        format!("timeline {} in-memory {self}", self.get_timeline_id())
+        Arc::clone(self.local_path_str())
    }
 }

@@ -2881,7 +2818,18 @@ impl Timeline {
        let mut timeline = self;

        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64);
+            if cnt >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
+                let mut rate_limit = LOGGED.lock().unwrap();
+                let rate_limited = rate_limit.rate_limited();
+                rate_limit.call(|| {
+                    warn!("Too many layers visited by non-vectored read: {} >= {}; rate limited {} occurences",
+                          cnt, Self::LAYERS_VISITED_WARN_THRESHOLD, rate_limited);
+                });
+            }
        });

        // For debugging purposes, collect the path of layers that we traversed
@@ -2921,32 +2869,35 @@ impl Timeline {
                        if prev <= cont_lsn {
                            // Didn't make any progress in last iteration. Error out to avoid
                            // getting stuck in the loop.
-                            return Err(layer_traversal_error(format!(
-                                "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                            return Err(PageReconstructError::MissingKey(MissingKeyError {
+                                stuck_at_lsn: true,
                                key,
-                                Lsn(cont_lsn.0 - 1),
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn: Lsn(cont_lsn.0 - 1),
                                request_lsn,
-                                timeline.ancestor_lsn
-                            ), traversal_path));
+                                ancestor_lsn: Some(timeline.ancestor_lsn),
+                                traversal_path,
+                                backtrace: None,
+                            }));
                        }
                    }
                    prev_lsn = Some(cont_lsn);
                }
                ValueReconstructResult::Missing => {
-                    return Err(layer_traversal_error(
-                        if cfg!(test) {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
-                            )
-                        } else {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
-                            )
-                        },
+                    return Err(PageReconstructError::MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn,
+                        request_lsn,
+                        ancestor_lsn: None,
                        traversal_path,
-                    ));
+                        backtrace: if cfg!(test) {
+                            Some(std::backtrace::Backtrace::force_capture())
+                        } else {
+                            None
+                        },
+                    }));
                }
            }

@@ -2969,7 +2920,6 @@ impl Timeline {

            // Check the open and frozen in-memory layers first, in order from newest
            // to oldest.
-            let in_mem_layer_find_timer = Instant::now();
            if let Some(open_layer) = &layers.open_layer {
                let start_lsn = open_layer.get_lsn_range().start;
                if cont_lsn > start_lsn {
@@ -2981,10 +2931,6 @@ impl Timeline {
                    let open_layer = open_layer.clone();
                    drop(guard);

-                    ctx.read_path_stats
-                        .inner
-                        .add_in_mem_layer_find_timer(in_mem_layer_find_timer.elapsed());
-
                    result = match open_layer
                        .get_value_reconstruct_data(
                            key,
@@ -2998,12 +2944,8 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || open_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
                    continue 'outer;
                }
            }
@@ -3016,10 +2958,6 @@ impl Timeline {
                    let frozen_layer = frozen_layer.clone();
                    drop(guard);

-                    ctx.read_path_stats
-                        .inner
-                        .add_in_mem_layer_find_timer(in_mem_layer_find_timer.elapsed());
-
                    result = match frozen_layer
                        .get_value_reconstruct_data(
                            key,
@@ -3033,22 +2971,13 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || frozen_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
                    continue 'outer;
                }
            }

-            let layer_map_search_timer = Instant::now();
            if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn) {
-                ctx.read_path_stats
-                    .inner
-                    .add_layer_map_search_time(layer_map_search_timer.elapsed());
-
                let layer = guard.get_from_desc(&layer);
                drop(guard);

@@ -3064,29 +2993,14 @@ impl Timeline {
                };
                cont_lsn = lsn_floor;
                *read_count += 1;
-                traversal_path.push((
-                    result,
-                    cont_lsn,
-                    Box::new({
-                        let layer = layer.to_owned();
-                        move || layer.traversal_id()
-                    }),
-                ));
+                traversal_path.push((result, cont_lsn, layer.traversal_id()));
                continue 'outer;
            } else if timeline.ancestor_timeline.is_some() {
-                ctx.read_path_stats
-                    .inner
-                    .add_layer_map_search_time(layer_map_search_timer.elapsed());
-
                // Nothing on this timeline. Traverse to parent
                result = ValueReconstructResult::Continue;
                cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
                continue 'outer;
            } else {
-                ctx.read_path_stats
-                    .inner
-                    .add_layer_map_search_time(layer_map_search_timer.elapsed());
-
                // Nothing found
                result = ValueReconstructResult::Missing;
                continue 'outer;
@@ -3132,29 +3046,6 @@ impl Timeline {
            .await?;

            keyspace.remove_overlapping_with(&completed);
-
-            // Do not descend into the ancestor timeline for aux files.
-            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
-            // stalling compaction.
-            // TODO(chi): this will need to be updated for aux files v2 storage
-            if keyspace.overlaps(&NON_INHERITED_RANGE) {
-                let removed = keyspace.remove_overlapping_with(&KeySpace {
-                    ranges: vec![NON_INHERITED_RANGE],
-                });
-
-                for range in removed.ranges {
-                    let mut key = range.start;
-                    while key < range.end {
-                        reconstruct_state.on_key_error(
-                            key,
-                            // TODO: use PageReconstructError::Missing once #7393 merges
-                            PageReconstructError::Other(anyhow!("Value for {} not found", key)),
-                        );
-                        key = key.next();
-                    }
-                }
-            }
-
            if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
                break;
            }
@@ -3206,96 +3097,60 @@ impl Timeline {
                return Err(GetVectoredError::Cancelled);
            }

-            let keyspace_manip_timer = Instant::now();
-
            let keys_done_last_step = reconstruct_state.consume_done_keys();
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);

-            ctx.read_path_stats
-                .inner
-                .add_keyspace_manipulation_timer(keyspace_manip_timer.elapsed());
+            let guard = timeline.layers.read().await;
+            let layers = guard.layer_map();

-            if unmapped_keyspace.start().is_some() {
-                let layer_search_timer = Instant::now();
+            let in_memory_layer = layers.find_in_memory_layer(|l| {
+                let start_lsn = l.get_lsn_range().start;
+                cont_lsn > start_lsn
+            });

-                let guard = timeline.layers.read().await;
-                let layers = guard.layer_map();
+            match in_memory_layer {
+                Some(l) => {
+                    let lsn_range = l.get_lsn_range().start..cont_lsn;
+                    fringe.update(
+                        ReadableLayer::InMemoryLayer(l),
+                        unmapped_keyspace.clone(),
+                        lsn_range,
+                    );
+                }
+                None => {
+                    for range in unmapped_keyspace.ranges.iter() {
+                        let results = layers.range_search(range.clone(), cont_lsn);

-                let in_mem_layer_find_timer = Instant::now();
-                let in_memory_layer = layers.find_in_memory_layer(|l| {
-                    let start_lsn = l.get_lsn_range().start;
-                    cont_lsn > start_lsn
-                });
-                ctx.read_path_stats
-                    .inner
-                    .add_in_mem_layer_find_timer(in_mem_layer_find_timer.elapsed());
-
-                match in_memory_layer {
-                    Some(l) => {
-                        let lsn_range = l.get_lsn_range().start..cont_lsn;
-
-                        let fringe_fondle_timer = Instant::now();
-
-                        fringe.update(
-                            ReadableLayer::InMemoryLayer(l),
-                            unmapped_keyspace.clone(),
-                            lsn_range,
-                        );
-                        ctx.read_path_stats
-                            .inner
-                            .add_fringe_fondle_time(fringe_fondle_timer.elapsed());
-                    }
-                    None => {
-                        for range in unmapped_keyspace.ranges.iter() {
-                            let layer_map_search_timer = Instant::now();
-
-                            let results = layers.range_search(range.clone(), cont_lsn);
-
-                            ctx.read_path_stats
-                                .inner
-                                .add_layer_map_search_time(layer_map_search_timer.elapsed());
-
-                            let fringe_fondle_timer = Instant::now();
-
-                            results
-                                .found
-                                .into_iter()
-                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                    (
-                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                        keyspace_accum.to_keyspace(),
-                                        lsn_floor..cont_lsn,
-                                    )
-                                })
-                                .for_each(|(layer, keyspace, lsn_range)| {
-                                    fringe.update(layer, keyspace, lsn_range)
-                                });
-
-                            ctx.read_path_stats
-                                .inner
-                                .add_fringe_fondle_time(fringe_fondle_timer.elapsed());
-                        }
+                        results
+                            .found
+                            .into_iter()
+                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                (
+                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
+                                    keyspace_accum.to_keyspace(),
+                                    lsn_floor..cont_lsn,
+                                )
+                            })
+                            .for_each(|(layer, keyspace, lsn_range)| {
+                                fringe.update(layer, keyspace, lsn_range)
+                            });
                    }
                }
-
-                // It's safe to drop the layer map lock after planning the next round of reads.
-                // The fringe keeps readable handles for the layers which are safe to read even
-                // if layers were compacted or flushed.
-                //
-                // The more interesting consideration is: "Why is the read algorithm still correct
-                // if the layer map changes while it is operating?". Doing a vectored read on a
-                // timeline boils down to pushing an imaginary lsn boundary downwards for each range
-                // covered by the read. The layer map tells us how to move the lsn downwards for a
-                // range at *a particular point in time*. It is fine for the answer to be different
-                // at two different time points.
-                drop(guard);
-
-                ctx.read_path_stats
-                    .inner
-                    .add_layer_search_time(layer_search_timer.elapsed());
            }

+            // It's safe to drop the layer map lock after planning the next round of reads.
+            // The fringe keeps readable handles for the layers which are safe to read even
+            // if layers were compacted or flushed.
+            //
+            // The more interesting consideration is: "Why is the read algorithm still correct
+            // if the layer map changes while it is operating?". Doing a vectored read on a
+            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
+            // covered by the read. The layer map tells us how to move the lsn downwards for a
+            // range at *a particular point in time*. It is fine for the answer to be different
+            // at two different time points.
+            drop(guard);
+
            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
                let next_cont_lsn = lsn_range.start;
                layer_to_read
@@ -3309,6 +3164,8 @@ impl Timeline {

                unmapped_keyspace = keyspace_to_read;
                cont_lsn = next_cont_lsn;
+
+                reconstruct_state.on_layer_visited();
            } else {
                break;
            }
@@ -3752,7 +3609,7 @@ impl Timeline {
        &self,
        disk_consistent_lsn: Lsn,
        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<TimelineMetadata> {
+    ) -> anyhow::Result<()> {
        // We can only save a valid 'prev_record_lsn' value on disk if we
        // flushed *all* in-memory changes to disk. We only track
        // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -3769,19 +3626,10 @@ impl Timeline {
            None
        };

-        let ancestor_timeline_id = self
-            .ancestor_timeline
-            .as_ref()
-            .map(|ancestor| ancestor.timeline_id);
-
-        let metadata = TimelineMetadata::new(
+        let update = crate::tenant::metadata::MetadataUpdate::new(
            disk_consistent_lsn,
            ondisk_prev_record_lsn,
-            ancestor_timeline_id,
-            self.ancestor_lsn,
            *self.latest_gc_cutoff_lsn.read(),
-            self.initdb_lsn,
-            self.pg_version,
        );

        fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -3793,10 +3641,10 @@ impl Timeline {
            for layer in layers_to_upload {
                remote_client.schedule_layer_file_upload(layer)?;
            }
-            remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
+            remote_client.schedule_index_upload_for_metadata_update(&update)?;
        }

-        Ok(metadata)
+        Ok(())
    }

    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
@@ -4069,15 +3917,6 @@ impl Timeline {
                                    {
                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
                                        ZERO_PAGE.clone()
-                                    } else if NON_INHERITED_RANGE.contains(&img_key)
-                                        && matches!(err, PageReconstructError::Other(_))
-                                    {
-                                        // Non inherited keys (i.e. aux files) are special. We
-                                        // might fail to reconstruct the image since we are
-                                        // prohibited from visiting the ancestor timelines.
-                                        // We don't want to stall compaction, so we simply ignore
-                                        // these failures.
-                                        continue;
                                    } else {
                                        return Err(CreateImageLayersError::PageReconstructError(
                                            err,
@@ -4901,35 +4740,7 @@ impl Timeline {
    }
 }

-type TraversalPathItem = (
-    ValueReconstructResult,
-    Lsn,
-    Box<dyn Send + FnOnce() -> TraversalId>,
-);
-
-/// Helper function for get_reconstruct_data() to add the path of layers traversed
-/// to an error, as anyhow context information.
-fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructError {
-    // We want the original 'msg' to be the outermost context. The outermost context
-    // is the most high-level information, which also gets propagated to the client.
-    let mut msg_iter = path
-        .into_iter()
-        .map(|(r, c, l)| {
-            format!(
-                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r,
-                c,
-                l(),
-            )
-        })
-        .chain(std::iter::once(msg));
-    // Construct initial message from the first traversed layer
-    let err = anyhow!(msg_iter.next().unwrap());
-
-    // Append all subsequent traversals, and the error message 'msg', as contexts.
-    let msg = msg_iter.fold(err, |err, msg| err.context(msg));
-    PageReconstructError::from(msg)
-}
+type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

 struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -200,6 +200,12 @@ struct ProxyCliArgs {
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Whether to retry the wake_compute request
+    #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
+    wake_compute_retry: String,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -584,6 +590,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
        aws_region: args.aws_region.clone(),
+        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
+        connect_to_compute_retry_config: config::RetryConfig::parse(
+            &args.connect_to_compute_retry,
+        )?,
    }));

    Ok(config)
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -33,6 +33,8 @@ pub struct ProxyConfig {
    pub region: String,
    pub handshake_timeout: Duration,
    pub aws_region: String,
+    pub wake_compute_retry_config: RetryConfig,
+    pub connect_to_compute_retry_config: RetryConfig,
 }

 #[derive(Debug)]
@@ -517,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions {
    }
 }

+/// This is a config for connect to compute and wake compute.
+#[derive(Clone, Copy, Debug)]
+pub struct RetryConfig {
+    /// Number of times we should retry.
+    pub max_retries: u32,
+    /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0
+    pub base_delay: tokio::time::Duration,
+    /// Exponential base for retry wait duration
+    pub backoff_factor: f64,
+}
+
+impl RetryConfig {
+    /// Default options for RetryConfig.
+
+    /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
+    pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
+    /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
+    /// Cplane has timeout of 60s on each request.
+    pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
+
+    /// Parse retry options passed via cmdline.
+    /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].
+    pub fn parse(options: &str) -> anyhow::Result<Self> {
+        let mut num_retries = None;
+        let mut base_retry_wait_duration = None;
+        let mut retry_wait_exponent_base = None;
+
+        for option in options.split(',') {
+            let (key, value) = option
+                .split_once('=')
+                .with_context(|| format!("bad key-value pair: {option}"))?;
+
+            match key {
+                "num_retries" => num_retries = Some(value.parse()?),
+                "base_retry_wait_duration" => {
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                }
+                "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
+                unknown => bail!("unknown key: {unknown}"),
+            }
+        }
+
+        Ok(Self {
+            max_retries: num_retries.context("missing `num_retries`")?,
+            base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?,
+            backoff_factor: retry_wait_exponent_base
+                .context("missing `retry_wait_exponent_base`")?,
+        })
+    }
+}
+
 /// Helper for cmdline cache options parsing.
 pub struct WakeComputeLockOptions {
    /// The number of shards the lock map should have
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -119,6 +119,10 @@ pub struct ProxyMetrics {

    /// Number of invalid endpoints (per protocol, per rejected).
    pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
+
+    /// Number of retries (per outcome, per retry_type).
+    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
+    pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
 }

 #[derive(MetricGroup)]
@@ -480,3 +484,16 @@ pub struct InvalidEndpointsGroup {
    pub rejected: Bool,
    pub outcome: ConnectOutcome,
 }
+
+#[derive(LabelGroup)]
+#[label(set = RetriesMetricSet)]
+pub struct RetriesMetricGroup {
+    pub outcome: ConnectOutcome,
+    pub retry_type: RetryType,
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
+pub enum RetryType {
+    WakeCompute,
+    ConnectToCompute,
+}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -308,6 +308,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        &TcpMechanism { params: &params },
        &user_info,
        mode.allow_self_signed_compute(config),
+        config.wake_compute_retry_config,
+        config.connect_to_compute_retry_config,
    )
    .or_else(|e| stream.throw_error(e))
    .await?;
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -1,10 +1,11 @@
 use crate::{
    auth::backend::ComputeCredentialKeys,
    compute::{self, PostgresConnection},
+    config::RetryConfig,
    console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
    context::RequestMonitoring,
    error::ReportableError,
-    metrics::{ConnectionFailureKind, Metrics},
+    metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
    proxy::{
        retry::{retry_after, ShouldRetry},
        wake_compute::wake_compute,
@@ -93,19 +94,23 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
    mechanism: &M,
    user_info: &B,
    allow_self_signed_compute: bool,
+    wake_compute_retry_config: RetryConfig,
+    connect_to_compute_retry_config: RetryConfig,
 ) -> Result<M::Connection, M::Error>
 where
    M::ConnectError: ShouldRetry + std::fmt::Debug,
    M::Error: From<WakeComputeError>,
 {
    let mut num_retries = 0;
-    let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+    let mut node_info =
+        wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
    if let Some(keys) = user_info.get_keys() {
        node_info.set_keys(keys);
    }
    node_info.allow_self_signed_compute = allow_self_signed_compute;
    // let mut node_info = credentials.get_node_info(ctx, user_info).await?;
    mechanism.update_connect_config(&mut node_info.config);
+    let retry_type = RetryType::ConnectToCompute;

    // try once
    let err = match mechanism
@@ -114,6 +119,13 @@ where
    {
        Ok(res) => {
            ctx.latency_timer.success();
+            Metrics::get().proxy.retries_metric.observe(
+                RetriesMetricGroup {
+                    outcome: ConnectOutcome::Success,
+                    retry_type,
+                },
+                num_retries.into(),
+            );
            return Ok(res);
        }
        Err(e) => e,
@@ -124,7 +136,7 @@ where
    let node_info = if !node_info.cached() {
        // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
        // Do not need to retrieve a new node_info, just return the old one.
-        if !err.should_retry(num_retries) {
+        if !err.should_retry(num_retries, connect_to_compute_retry_config) {
            return Err(err.into());
        }
        node_info
@@ -132,7 +144,8 @@ where
        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
        info!("compute node's state has likely changed; requesting a wake-up");
        let old_node_info = invalidate_cache(node_info);
-        let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+        let mut node_info =
+            wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
        node_info.reuse_settings(old_node_info);

        mechanism.update_connect_config(&mut node_info.config);
@@ -151,19 +164,34 @@ where
        {
            Ok(res) => {
                ctx.latency_timer.success();
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Success,
+                        retry_type,
+                    },
+                    num_retries.into(),
+                );
+                info!(?num_retries, "connected to compute node after");
                return Ok(res);
            }
            Err(e) => {
-                let retriable = e.should_retry(num_retries);
+                let retriable = e.should_retry(num_retries, connect_to_compute_retry_config);
                if !retriable {
                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+                    Metrics::get().proxy.retries_metric.observe(
+                        RetriesMetricGroup {
+                            outcome: ConnectOutcome::Failed,
+                            retry_type,
+                        },
+                        num_retries.into(),
+                    );
                    return Err(e.into());
                }
                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
            }
        }

-        let wait_duration = retry_after(num_retries);
+        let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
        num_retries += 1;

        time::sleep(wait_duration).await;
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -1,18 +1,12 @@
-use crate::compute;
+use crate::{compute, config::RetryConfig};
 use std::{error::Error, io};
 use tokio::time;

-/// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
-pub const NUM_RETRIES_CONNECT: u32 = 16;
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
-const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
-
 pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
-    fn should_retry(&self, num_retries: u32) -> bool {
+    fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool {
        match self {
-            _ if num_retries >= NUM_RETRIES_CONNECT => false,
+            _ if num_retries >= config.max_retries => false,
            err => err.could_retry(),
        }
    }
@@ -63,6 +57,8 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-pub fn retry_after(num_retries: u32) -> time::Duration {
-    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
+pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
+    config
+        .base_delay
+        .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -10,13 +10,13 @@ use super::*;
 use crate::auth::backend::{
    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
 };
-use crate::config::CertResolver;
+use crate::config::{CertResolver, RetryConfig};
 use crate::console::caches::NodeInfoCache;
 use crate::console::messages::MetricsAuxInfo;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
-use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
+use crate::proxy::retry::retry_after;
 use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
@@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..NUM_RETRIES_CONNECT {
-        total_wait += retry_after(num_retries);
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    for num_retries in 1..config.max_retries {
+        total_wait += retry_after(num_retries, config);
    }
-    assert!(total_wait < tokio::time::Duration::from_secs(12));
-    assert!(total_wait > tokio::time::Duration::from_secs(10));
+    assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1);
 }

 #[derive(Clone, Copy, Debug)]
@@ -549,7 +553,12 @@ async fn connect_to_compute_success() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -562,7 +571,12 @@ async fn connect_to_compute_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap_err();
    mechanism.verify();
@@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() {
 #[tokio::test]
 async fn connect_to_compute_non_retry_3() {
    let _ = env_logger::try_init();
-    assert_eq!(NUM_RETRIES_CONNECT, 16);
+    tokio::time::pause();
    use ConnectAction::*;
    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![
-        Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
-    ]);
+    let mechanism =
+        TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
-        .await
-        .unwrap_err();
+    let wake_compute_retry_config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 1,
+        backoff_factor: 2.0,
+    };
+    let connect_to_compute_retry_config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(
+        &mut ctx,
+        &mechanism,
+        &user_info,
+        false,
+        wake_compute_retry_config,
+        connect_to_compute_retry_config,
+    )
+    .await
+    .unwrap_err();
    mechanism.verify();
 }

@@ -622,7 +661,12 @@ async fn wake_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -636,7 +680,12 @@ async fn wake_non_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap_err();
    mechanism.verify();
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,10 +1,14 @@
+use crate::config::RetryConfig;
 use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
 use crate::context::RequestMonitoring;
-use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind};
+use crate::metrics::{
+    ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
+    WakeupFailureKind,
+};
 use crate::proxy::retry::retry_after;
 use hyper::StatusCode;
 use std::ops::ControlFlow;
-use tracing::{error, warn};
+use tracing::{error, info, warn};

 use super::connect_compute::ComputeConnectBackend;
 use super::retry::ShouldRetry;
@@ -13,23 +17,42 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
    num_retries: &mut u32,
    ctx: &mut RequestMonitoring,
    api: &B,
+    config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
+    let retry_type = RetryType::WakeCompute;
    loop {
        let wake_res = api.wake_compute(ctx).await;
-        match handle_try_wake(wake_res, *num_retries) {
+        match handle_try_wake(wake_res, *num_retries, config) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
                report_error(&e, false);
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Failed,
+                        retry_type,
+                    },
+                    (*num_retries).into(),
+                );
                return Err(e);
            }
            Ok(ControlFlow::Continue(e)) => {
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
                report_error(&e, true);
            }
-            Ok(ControlFlow::Break(n)) => return Ok(n),
+            Ok(ControlFlow::Break(n)) => {
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Success,
+                        retry_type,
+                    },
+                    (*num_retries).into(),
+                );
+                info!(?num_retries, "compute node woken up after");
+                return Ok(n);
+            }
        }

-        let wait_duration = retry_after(*num_retries);
+        let wait_duration = retry_after(*num_retries, config);
        *num_retries += 1;
        tokio::time::sleep(wait_duration).await;
    }
@@ -42,10 +65,11 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
 pub fn handle_try_wake(
    result: Result<CachedNodeInfo, WakeComputeError>,
    num_retries: u32,
+    config: RetryConfig,
 ) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
    match result {
        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => {
                Ok(ControlFlow::Continue(err))
            }
            _ => Err(err),
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -108,6 +108,8 @@ impl PoolingBackend {
            },
            &backend,
            false, // do not allow self signed compute for http flow
+            self.config.wake_compute_retry_config,
+            self.config.connect_to_compute_retry_config,
        )
        .await
    }
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_sum",
    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
    *histogram("pageserver_smgr_query_seconds_global"),
-    *histogram("pageserver_read_num_fs_layers"),
+    *histogram("pageserver_layers_visited_per_read_global"),
    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
    *histogram("pageserver_wait_lsn_seconds"),
    *histogram("pageserver_remote_operation_seconds"),
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -507,11 +507,6 @@ class NeonEnvBuilder:
            self.pageserver_get_vectored_impl = "vectored"
            log.debug('Overriding pageserver get_vectored_impl config to "vectored"')

-        self.pageserver_get_impl: Optional[str] = None
-        if os.getenv("PAGESERVER_GET_IMPL", "") == "vectored":
-            self.pageserver_get_impl = "vectored"
-            log.debug('Overriding pageserver get_impl config to "vectored"')
-
        assert test_name.startswith(
            "test_"
        ), "Unexpectedly instantiated from outside a test function"
@@ -1083,8 +1078,6 @@ class NeonEnv:
                ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
            if config.pageserver_get_vectored_impl is not None:
                ps_cfg["get_vectored_impl"] = config.pageserver_get_vectored_impl
-            if config.pageserver_get_impl is not None:
-                ps_cfg["get_impl"] = config.pageserver_get_impl

            # Create a corresponding NeonPageserver object
            self.pageservers.append(
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -17,16 +17,11 @@ from fixtures.types import TenantId, TimelineId
 # Test restarting page server, while safekeeper and compute node keep
 # running.
 def test_local_corruption(neon_env_builder: NeonEnvBuilder):
-    if neon_env_builder.pageserver_get_impl == "vectored":
-        reconstruct_function_name = "get_values_reconstruct_data"
-    else:
-        reconstruct_function_name = "get_value_reconstruct_data"
-
    env = neon_env_builder.init_start()

    env.pageserver.allowed_errors.extend(
        [
-            f".*{reconstruct_function_name} for layer .*",
+            ".*get_value_reconstruct_data for layer .*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
@@ -89,7 +84,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
    # (We don't check layer file contents on startup, when loading the timeline)
    #
    # This will change when we implement checksums for layers
-    with pytest.raises(Exception, match=f"{reconstruct_function_name} for layer ") as err:
+    with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
        pg2.start()
    log.info(
        f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -0,0 +1,93 @@
+import os
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.workload import Workload
+
+AGGRESIVE_COMPACTION_TENANT_CONF = {
+    # Disable gc and compaction. The test runs compaction manually.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # Small checkpoint distance to create many layers
+    "checkpoint_distance": 1024**2,
+    # Compact small layers
+    "compaction_target_size": 1024**2,
+    "image_creation_threshold": 2,
+    # INC-186: remove when merging the fix
+    "image_layer_creation_check_threshold": 0,
+}
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
+    """
+    This is a smoke test that compaction kicks in. The workload repeatedly churns
+    a small number of rows and manually instructs the pageserver to run compaction
+    between iterations. At the end of the test validate that the average number of
+    layers visited to gather reconstruct data for a given key is within the empirically
+    observed bounds.
+    """
+
+    # Effectively disable the page cache to rely only on image layers
+    # to shorten reads.
+    neon_env_builder.pageserver_config_override = """
+page_cache_size=10
+"""
+
+    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 10000
+    churn_rounds = 100
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload.churn_rows(row_count, env.pageserver.id)
+        ps_http.timeline_compact(tenant_id, timeline_id)
+
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+
+    log.info("Checking layer access metrics ...")
+
+    layer_access_metric_names = [
+        "pageserver_layers_visited_per_read_global_sum",
+        "pageserver_layers_visited_per_read_global_count",
+        "pageserver_layers_visited_per_read_global_bucket",
+        "pageserver_layers_visited_per_vectored_read_global_sum",
+        "pageserver_layers_visited_per_vectored_read_global_count",
+        "pageserver_layers_visited_per_vectored_read_global_bucket",
+    ]
+
+    metrics = env.pageserver.http_client().get_metrics()
+    for name in layer_access_metric_names:
+        layer_access_metrics = metrics.query_all(name)
+        log.info(f"Got metrics: {layer_access_metrics}")
+
+    non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
+    non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
+    non_vectored_average = non_vectored_sum / non_vectored_count
+
+    vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
+    vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
+    vectored_average = vectored_sum / vectored_count
+
+    log.info(f"{non_vectored_average=} {vectored_average=}")
+
+    # The upper bound for average number of layer visits below (8)
+    # was chosen empirically for this workload.
+    assert non_vectored_average < 8
+    assert vectored_average < 8
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -226,11 +226,6 @@ def test_forward_compatibility(
    )

    try:
-        # Previous version neon_local and pageserver are not aware
-        # of the new config.
-        # TODO: remove this once the code reaches main
-        neon_env_builder.pageserver_get_impl = None
-
        neon_env_builder.num_safekeepers = 3
        neon_local_binpath = neon_env_builder.neon_binpath
        env = neon_env_builder.from_repo_dir(
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -1,5 +1,4 @@
 import time
-import re
 from datetime import datetime, timedelta, timezone

 from fixtures.log_helper import log
@@ -110,11 +109,6 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):

 # Test pageserver get_timestamp_of_lsn API
 def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
-    if neon_env_builder.pageserver_get_impl == "vectored":
-        key_not_found_error = r".*Requested key.*not found,*"
-    else:
-        key_not_found_error = r".*could not find data for key.*"
-
    env = neon_env_builder.init_start()

    new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api")
@@ -183,8 +177,8 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
            raise RuntimeError("there should have been an 'could not find data for key' error")
        except PageserverApiException as error:
            assert error.status_code == 500
-            assert re.match(key_not_found_error, str(error))
-            env.pageserver.allowed_errors.append(key_not_found_error)
+            assert str(error).startswith("could not find data for key")
+            env.pageserver.allowed_errors.append(".*could not find data for key.*")

        # Probe a bunch of timestamps in the valid range
        step_size = 100
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
        max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
        diff = max_lsn - min_lsn
        assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"
+
+
+def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that an unlogged relation is handled properly on a sharded tenant
+
+    Reproducer for https://github.com/neondatabase/neon/issues/7451
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    neon_env_builder.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8)
+
+    # We will create many tables to ensure it's overwhelmingly likely that at least one
+    # of them doesn't land on shard 0
+    table_names = [f"my_unlogged_{i}" for i in range(0, 16)]
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        for table_name in table_names:
+            ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));")
+            ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')")
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == [(1, "foo")]
+            ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);")
+
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        for table_name in table_names:
+            # Check that table works: we can select and insert
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == []
+            ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');")
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == [(2, "bar")]
+
+        # Ensure that post-endpoint-restart modifications are ingested happily by pageserver
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -273,7 +273,8 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
    but imports the generation number.
    """

-    neon_env_builder.num_pageservers = 2
+    # One pageserver to simulate legacy environment, two to be managed by storage controller
+    neon_env_builder.num_pageservers = 3

    # Start services by hand so that we can skip registration on one of the pageservers
    env = neon_env_builder.init_configs()
@@ -288,10 +289,10 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
    )
    origin_ps = env.pageservers[0]

-    # This is the pageserver managed by the sharding service, where the tenant
+    # These are the pageservers managed by the sharding service, where the tenant
    # will be attached after onboarding
    env.pageservers[1].start()
-    dest_ps = env.pageservers[1]
+    env.pageservers[2].start()
    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)

    for sk in env.safekeepers:
@@ -330,6 +331,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
        )

        virtual_ps_http.tenant_secondary_download(tenant_id)
+        warm_up_ps = env.storage_controller.tenant_describe(tenant_id)["shards"][0][
+            "node_secondary"
+        ][0]

    # Call into storage controller to onboard the tenant
    generation += 1
@@ -344,6 +348,18 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
    )
    assert len(r["shards"]) == 1

+    describe = env.storage_controller.tenant_describe(tenant_id)["shards"][0]
+    dest_ps_id = describe["node_attached"]
+    dest_ps = env.get_pageserver(dest_ps_id)
+    if warm_up:
+        # The storage controller should have attached the tenant to the same placce
+        # it had a secondary location, otherwise there was no point warming it up
+        assert dest_ps_id == warm_up_ps
+
+        # It should have been given a new secondary location as well
+        assert len(describe["node_secondary"]) == 1
+        assert describe["node_secondary"][0] != warm_up_ps
+
    # As if doing a live migration, detach the original pageserver
    origin_ps.http_client().tenant_location_conf(
        tenant_id,
@@ -415,6 +431,9 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up
        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
    )
    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
+
+    # Storage controller auto-sets heatmap period, ignore it for the comparison
+    del dest_tenant_conf_after.tenant_specific_overrides["heatmap_period"]
    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf

    env.storage_controller.consistency_check()
Author	SHA1	Message	Date
Vlad Lazar	82e3866c27	pageserver: rate limited warning on too many layers visited	2024-04-22 17:32:12 +01:00
Vlad Lazar	7a8effc190	tests: add a compaction smoke test	2024-04-22 17:32:12 +01:00
Vlad Lazar	35e299d308	pageserver: add a metric counting layer visits on vectored read path	2024-04-22 17:32:10 +01:00
Vlad Lazar	ea5b36c47a	pageserver: tweak metric counting layer visits on non-vectored read path * Give it a better name * Track all layers * Larger buckets	2024-04-22 17:32:05 +01:00
Alex Chi Z	25d9dc6eaf	chore(pageserver): separate missing key error (#7393 ) As part of https://github.com/neondatabase/neon/pull/7375 and to improve the current vectored get implementation, we separate the missing key error out. This also saves us several Box allocations in the get page implementation. ## Summary of changes * Create a caching field of layer traversal id for each of the layer. * Remove box allocations for layer traversal id retrieval and implement MissingKey error message as before. This should be a little bit faster. * Do not format error message until `Display`. * For in-mem layer, the descriptor is different before/after frozen. I'm using once lock for that. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-04-22 10:40:35 -04:00
Christian Schwarz	139d1346d5	pagectl draw-timeline-dir: include layer file name as an SVG comment (#7455 ) fixes https://github.com/neondatabase/neon/issues/7452 Also, drive-by improve the usage instructions with commands I found useful during that incident. The patch in the fork of `svg_fmt` is [being upstreamed](https://github.com/nical/rust_debug/pull/4), but, in the meantime, let's commit what we have because it was useful during the incident.	2024-04-22 12:55:17 +00:00
John Spray	0bd16182f7	pageserver: fix unlogged relations with sharding (#7454 ) ## Problem - #7451 INIT_FORKNUM blocks must be stored on shard 0 to enable including them in basebackup. This issue can be missed in simple tests because creating an unlogged table isn't sufficient -- to repro I had to create an _index_ on an unlogged table (then restart the endpoint). Closes: #7451 ## Summary of changes - Add a reproducer for the issue. - Tweak the condition for `key_is_shard0` to include anything that isn't a normal relation block _and_ any normal relation block whose forknum is INIT_FORKNUM. - To enable existing databases to recover from the issue, add a special case that omits relations if they were stored on the wrong INITFORK. This enables postgres to start and the user to drop the table and recreate it.	2024-04-22 11:47:24 +00:00
Anna Khanova	6a5650d40c	proxy: Make retries configurable and record it. (#7438 ) ## Problem Currently we cannot configure retries, also, we don't really have visibility of what's going on there. ## Summary of changes * Added cli params * Improved logging * Decrease the number of retries: it feels like most of retries doesn't help. Once there would be better errors handling, we can increase it back.	2024-04-22 11:37:22 +00:00
Joonas Koivunen	47addc15f1	relaxation: allow using layers across timelines (#7453 ) Before, we asserted that a layer would only be loaded by the timeline that initially created it. Now, with the ancestor detach, we will want to utilize remote copy as much as possible, so we will need to open other timeline layers as our own. Cc: #6994	2024-04-22 13:04:37 +03:00
Joonas Koivunen	b91c58a8bf	refactor(Timeline): simpler metadata updates (#7422 ) Currently, any `Timeline::schedule_uploads` will generate a fresh `TimelineMetadata` instead of updating the values, which it means to update. This makes it impossible for #6994 to work while `Timeline` receives layer flushes by overwriting any configured new `ancestor_timeline_id` and possible `ancestor_lsn`. The solution is to only make full `TimelineMetadata` "updates" from one place: branching. At runtime, update only the three fields, same as before in `Timeline::schedule_updates`.	2024-04-22 11:57:14 +03:00
Heikki Linnakangas	00d9c2d9a8	Make another walcraft test more robust (#7439 ) There were two issues with the test at page boundaries: 1. If the first logical message with 10 bytes payload crossed a page boundary, the calculated 'base_size' was too large because it included the page header. 2. If it was inserted near the end of a page so that there was not enough room for another one, we did "remaining_lsn += XLOG_BLCKSZ" but that didn't take into account the page headers either. As a result, the test would fail if the WAL insert position at the beginning of the test was too close to the end of a WAL page. Fix the calculations by repeating the 10-byte logical message if the starting position is not suitable. I bumped into this with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure. This is similar to https://github.com/neondatabase/neon/pull/7436, but for different test.	2024-04-22 10:58:28 +03:00
Heikki Linnakangas	3a673dce67	Make test less sensitive to exact WAL positions (#7436 ) As noted in the comment, the craft_internal() function fails if the inserted WAL happens to land at page boundary. I bumped into that with PR #7377; it changed the arguments of a few SQL functions in neon_test_utils extension, which changed the WAL positions slightly, and caused a test failure.	2024-04-22 10:58:10 +03:00
Em Sharnoff	35e9fb360b	Bump vm-builder v0.23.2 -> v0.28.1 (#7433 ) Only one relevant change, from v0.28.0: - neondatabase/autoscaling#887 Double-checked with `git log neonvm/tools/vm-builder`.	2024-04-21 17:35:01 -07:00
Heikki Linnakangas	0d21187322	update rustls ## Problem `cargo deny check` is complaining about our rustls versions, causing CI to fail: ``` error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:395:1 │ 395 │ rustls 0.21.9 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. = Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) error[vulnerability]: `rustls::ConnectionCommon::complete_io` could fall into an infinite loop based on network input ┌─ /__w/neon/neon/Cargo.lock:396:1 │ 396 │ rustls 0.22.2 registry+https://github.com/rust-lang/crates.io-index │ ------------------------------------------------------------------- security vulnerability detected │ = ID: RUSTSEC-2024-0336 = Advisory: https://rustsec.org/advisories/RUSTSEC-2024-0336 = If a `close_notify` alert is received during a handshake, `complete_io` does not terminate. Callers which do not call `complete_io` are not affected. `rustls-tokio` and `rustls-ffi` do not call `complete_io` and are not affected. `rustls::Stream` and `rustls::StreamOwned` types use `complete_io` and are affected. = Announcement: https://github.com/rustls/rustls/security/advisories/GHSA-6g7w-8wpp-frhj = Solution: Upgrade to >=0.23.5 OR >=0.22.4, <0.23.0 OR >=0.21.11, <0.22.0 (try `cargo update -p rustls`) ``` ## Summary of changes `cargo update -p rustls@0.21.9 -p rustls@0.22.2`	2024-04-21 21:10:05 +01:00
Alexander Bayandin	e8a98adcd0	CI: downgrade docker/setup-buildx-action to v2 - Cleanup part for `docker/setup-buildx-action` started to fail with the following error (for no obvious reason): ``` /nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175 throw new Error(`Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved.`); ^ Error: Path Validation Error: Path(s) specified in the action for caching do(es) not exist, hence no cache is being saved. at Object.rejected (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:175:1) at Generator.next (<anonymous>) at fulfilled (/nvme/actions-runner/_work/_actions/docker/setup-buildx-action/v3/webpack:/docker-setup-buildx/node_modules/@actions/cache/lib/cache.js:29:1) ``` - Downgrade `docker/setup-buildx-action` from v3 to v2	2024-04-21 21:10:05 +01:00
John Spray	98be8b9430	storcon_cli: `tenant-warmup` command (#7432 ) ## Problem When we migrate a large existing tenant, we would like to be able to ensure it has pre-loaded layers onto a pageserver managed by the storage controller. ## Summary of changes - Add `storcon_cli tenant-warmup`, which configures the tenant into PlacementPolicy::Secondary (unless it's already attached), and then polls the secondary download API reporting progress. - Extend a test case to check that when onboarding with a secondary location pre-created, we properly use that location for our first attachment.	2024-04-19 12:32:58 +01:00