Update pageserver/src/tenant.rs

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
feat(pageserver): remove aux v1 keyspace if user fully switches to v2
2026-05-25 00:50:36 +00:00 · 2024-05-21 13:32:57 -04:00 · 2024-05-21 11:21:27 -04:00 · 2024-05-21 18:13:54 +03:00 · 2024-05-21 09:18:11 -05:00 · 2024-05-21 09:18:11 -05:00
46 changed files with 1650 additions and 392 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 !libs/
 !neon_local/
 !pageserver/
+!patches/
 !pgxn/
 !proxy/
 !s3_scrubber/
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1072,9 +1072,9 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.31"
+version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
 dependencies = [
 "android-tzdata",
 "iana-time-zone",
@@ -1082,7 +1082,7 @@ dependencies = [
 "num-traits",
 "serde",
 "wasm-bindgen",
- "windows-targets 0.48.0",
+ "windows-targets 0.52.4",
 ]

 [[package]]
@@ -1109,7 +1109,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b"
 dependencies = [
 "ciborium-io",
- "half",
+ "half 1.8.2",
 ]

 [[package]]
@@ -2278,6 +2278,17 @@ version = "1.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"

+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"
@@ -3902,12 +3913,13 @@ dependencies = [

 [[package]]
 name = "parquet"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "ahash",
 "bytes",
 "chrono",
+ "half 2.4.1",
 "hashbrown 0.14.5",
 "num",
 "num-bigint",
@@ -3916,12 +3928,13 @@ dependencies = [
 "thrift",
 "twox-hash",
 "zstd",
+ "zstd-sys",
 ]

 [[package]]
 name = "parquet_derive"
-version = "49.0.0"
-source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+version = "51.0.0"
+source = "git+https://github.com/apache/arrow-rs?branch=master#2534976a564be3d2d56312dc88fb1b6ed4cef829"
 dependencies = [
 "parquet",
 "proc-macro2",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -122,8 +122,8 @@ opentelemetry = "0.20.0"
 opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
-parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
-parquet_derive = "49.0.0"
+parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "51.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 procfs = "0.14"
@@ -244,8 +244,8 @@ tonic-build = "0.9"
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 # bug fixes for UUID
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
-parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
+parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }

 ################# Binary contents sections

--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
-    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
+COPY patches/pgvector.patch /pgvector.patch
+
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \
+    echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
+    patch -p1 < /pgvector.patch && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
-[![Neon](https://user-images.githubusercontent.com/13738772/236813940-dcfdcb5b-69d3-449b-a686-013febe834d4.png)](https://neon.tech)
+[![Neon](https://github.com/neondatabase/neon/assets/11527560/f15a17f0-836e-40c5-b35d-030606a6b660)](https://neon.tech)
+
+

 # Neon

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -243,9 +243,13 @@ impl StorageController {
                anyhow::bail!("initdb failed with status {status}");
            }

+            // Write a minimal config file:
+            // - Specify the port, since this is chosen dynamically
+            // - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
+            //   the storage controller we don't want a slow local disk to interfere with that.
            tokio::fs::write(
                &pg_data_path.join("postgresql.conf"),
-                format!("port = {}", self.postgres_port),
+                format!("port = {}\nfsync=off\n", self.postgres_port),
            )
            .await?;
        };
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -307,7 +307,7 @@ impl KeySpace {
    }

    /// Merge another keyspace into the current one.
-    /// Note: the keyspaces must not ovelap (enforced via assertions)
+    /// Note: the keyspaces must not overlap (enforced via assertions). To merge overlapping key ranges, use `KeySpaceRandomAccum`.
    pub fn merge(&mut self, other: &KeySpace) {
        let all_ranges = self
            .ranges
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -820,10 +820,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        Ok(ProcessMsgResult::Continue)
    }

-    /// Log as info/error result of handling COPY stream and send back
-    /// ErrorResponse if that makes sense. Shutdown the stream if we got
-    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
-    /// close.
+    /// - Log as info/error result of handling COPY stream and send back
+    ///   ErrorResponse if that makes sense.
+    /// - Shutdown the stream if we got Terminate.
+    /// - Then close the connection because we don't handle exiting from COPY
+    ///   stream normally.
    pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
        use CopyStreamHandlerEnd::*;

@@ -849,10 +850,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            }
        }

-        if let Terminate = &end {
-            self.state = ProtoState::Closed;
-        }
-
        let err_to_send_and_errcode = match &end {
            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
            Other(_) => Some((format!("{end:#}"), SQLSTATE_INTERNAL_ERROR)),
@@ -882,6 +879,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                error!("failed to send ErrorResponse: {}", ee);
            }
        }
+
+        // Proper COPY stream finishing to continue using the connection is not
+        // implemented at the server side (we don't need it so far). To prevent
+        // further usages of the connection, close it.
+        self.framed.shutdown().await.ok();
+        self.state = ProtoState::Closed;
    }
 }

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -359,7 +359,7 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
            // Is there enough space on the page for another logical message and an
            // XLOG_SWITCH? If not, start over.
            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
                continue;
            }

--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -50,6 +50,9 @@ pub struct SkTimelineInfo {
    pub safekeeper_connstr: Option<String>,
    #[serde(default)]
    pub http_connstr: Option<String>,
+    // Minimum flush LSN among all active read-only (RO) replicas
+    #[serde(default = "lsn_invalid")]
+    pub standby_horizon: Lsn,
 }

 #[derive(Debug, Clone, Deserialize, Serialize)]
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -496,9 +496,9 @@ mod tests {
                // TODO: When updating Postgres versions, this test will cause
                // problems. Postgres version in message needs updating.
                //
-                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160002, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
+                // Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160003, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
                vec![
-                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 3, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
                    147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
                    188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -525,6 +525,15 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static STANDBY_HORIZON: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_standby_horizon",
+        "Standby apply LSN for which GC is hold off, by timeline.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
@@ -2098,6 +2107,7 @@ pub(crate) struct TimelineMetrics {
    pub garbage_collect_histo: StorageTimeMetrics,
    pub find_gc_cutoffs_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
+    pub standby_horizon_gauge: IntGauge,
    pub resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
@@ -2167,6 +2177,9 @@ impl TimelineMetrics {
        let last_record_gauge = LAST_RECORD_LSN
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
+        let standby_horizon_gauge = STANDBY_HORIZON
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
@@ -2212,6 +2225,7 @@ impl TimelineMetrics {
            find_gc_cutoffs_histo,
            load_layer_map_histo,
            last_record_gauge,
+            standby_horizon_gauge,
            resident_physical_size_gauge,
            current_logical_size_gauge,
            aux_file_size_gauge,
@@ -2246,6 +2260,7 @@ impl TimelineMetrics {
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -40,7 +40,11 @@ use utils::bin_ser::DeserializeError;
 use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

-const MAX_AUX_FILE_DELTAS: usize = 1024;
+/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
+pub const MAX_AUX_FILE_DELTAS: usize = 1024;
+
+/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
+pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;

 #[derive(Debug)]
 pub enum LsnForTimestamp {
@@ -869,8 +873,12 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
-            result.add_key(AUX_FILES_KEY);
+
+        // Remove v1 keyspace if the user has fully switched to v2.
+        if self.last_aux_file_policy.load() != Some(AuxFilePolicy::V2) {
+            if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+                result.add_key(AUX_FILES_KEY);
+            }
        }

        Ok((
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3968,9 +3968,9 @@ mod tests {
    use crate::tenant::harness::*;
    use crate::tenant::timeline::CompactFlags;
    use crate::DEFAULT_PG_VERSION;
-    use bytes::BytesMut;
+    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
-    use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::CompactionAlgorithm;
    use rand::{thread_rng, Rng};
@@ -4777,7 +4777,12 @@ mod tests {
            info!("Doing vectored read on {:?}", read);

            let vectored_res = tline
-                .get_vectored_impl(read.clone(), reads_lsn, ValuesReconstructState::new(), &ctx)
+                .get_vectored_impl(
+                    read.clone(),
+                    reads_lsn,
+                    &mut ValuesReconstructState::new(),
+                    &ctx,
+                )
                .await;
            tline
                .validate_get_vectored_impl(&vectored_res, read, reads_lsn, &ctx)
@@ -4826,7 +4831,7 @@ mod tests {
            .get_vectored_impl(
                aux_keyspace.clone(),
                read_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                &ctx,
            )
            .await;
@@ -4971,7 +4976,7 @@ mod tests {
            .get_vectored_impl(
                read.clone(),
                current_lsn,
-                ValuesReconstructState::new(),
+                &mut ValuesReconstructState::new(),
                &ctx,
            )
            .await?;
@@ -5106,7 +5111,7 @@ mod tests {
                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
                    },
                    query_lsn,
-                    ValuesReconstructState::new(),
+                    &mut ValuesReconstructState::new(),
                    &ctx,
                )
                .await;
@@ -5547,7 +5552,7 @@ mod tests {
            .await?;

        const NUM_KEYS: usize = 1000;
-        const STEP: usize = 100; // random update + scan base_key + idx * STEP
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP

        let cancel = CancellationToken::new();

@@ -5580,7 +5585,7 @@ mod tests {

        let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));

-        for _ in 0..10 {
+        for iter in 0..=10 {
            // Read all the blocks
            for (blknum, last_lsn) in updated.iter().enumerate() {
                test_key.field6 = (blknum * STEP) as u32;
@@ -5595,7 +5600,7 @@ mod tests {
                .get_vectored_impl(
                    keyspace.clone(),
                    lsn,
-                    ValuesReconstructState::default(),
+                    &mut ValuesReconstructState::default(),
                    &ctx,
                )
                .await?
@@ -5631,17 +5636,91 @@ mod tests {
                updated[blknum] = lsn;
            }

-            // Perform a cycle of flush, compact, and GC
-            tline.freeze_and_flush().await?;
-            tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
-            tenant
-                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
-                .await?;
+            // Perform two cycles of flush, compact, and GC
+            for round in 0..2 {
+                tline.freeze_and_flush().await?;
+                tline
+                    .compact(
+                        &cancel,
+                        if iter % 5 == 0 && round == 0 {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        } else {
+                            EnumSet::empty()
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                tenant
+                    .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                    .await?;
+            }
        }

        Ok(())
    }

+    #[tokio::test]
+    async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_compaction_trigger")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        for _ in 0..20 {
+            lsn = Lsn(lsn.0 + 0x10);
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", 0, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+            tline.freeze_and_flush().await?; // force create a delta layer
+        }
+
+        let before_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
+
+        let after_num_l0_delta_files = tline
+            .layers
+            .read()
+            .await
+            .layer_map()
+            .get_level0_deltas()?
+            .len();
+
+        assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
+
+        assert_eq!(
+            tline.get(test_key, lsn, &ctx).await?,
+            test_img(&format!("{} at {}", 0, lsn))
+        );
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn test_branch_copies_dirty_aux_file_flag() {
        let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag").unwrap();
@@ -5916,5 +5995,379 @@ mod tests {
            files.get("pg_logical/mappings/test3"),
            Some(&bytes::Bytes::from_static(b"last"))
        );
+
+        // Check that we are going to remove v1 aux files.
+        let (mut dense_keyspace, _) = tline.collect_keyspace(lsn, &ctx).await.unwrap();
+        assert!(dense_keyspace.remove_overlapping_with(&KeySpace::single(AUX_FILES_KEY..AUX_FILES_KEY.next())).is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_metadata_image_creation() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_metadata_image_creation")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        const NUM_KEYS: usize = 1000;
+        const STEP: usize = 10000; // random update + scan base_key + idx * STEP
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        let mut test_key = base_key;
+        let mut lsn = Lsn(0x10);
+
+        async fn scan_with_statistics(
+            tline: &Timeline,
+            keyspace: &KeySpace,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
+            let mut reconstruct_state = ValuesReconstructState::default();
+            let res = tline
+                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .await?;
+            Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for blknum in 0..NUM_KEYS {
+            lsn = Lsn(lsn.0 + 0x10);
+            test_key.field6 = (blknum * STEP) as u32;
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    test_key,
+                    lsn,
+                    &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+        }
+
+        let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
+
+        for iter in 1..=10 {
+            for _ in 0..NUM_KEYS {
+                lsn = Lsn(lsn.0 + 0x10);
+                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                test_key.field6 = (blknum * STEP) as u32;
+                let mut writer = tline.writer().await;
+                writer
+                    .put(
+                        test_key,
+                        lsn,
+                        &Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(lsn);
+                drop(writer);
+            }
+
+            tline.freeze_and_flush().await?;
+
+            if iter % 5 == 0 {
+                let (_, before_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                tline
+                    .compact(
+                        &cancel,
+                        {
+                            let mut flags = EnumSet::new();
+                            flags.insert(CompactFlags::ForceImageLayerCreation);
+                            flags.insert(CompactFlags::ForceRepartition);
+                            flags
+                        },
+                        &ctx,
+                    )
+                    .await?;
+                let (_, after_delta_file_accessed) =
+                    scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
+                assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
+                // Given that we already produced an image layer, there should be no delta layer needed for the scan, but still setting a low threshold there for unforeseen circumstances.
+                assert!(
+                    after_delta_file_accessed <= 2,
+                    "after_delta_file_accessed={after_delta_file_accessed}"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(base_key, lsn, &Value::Image(test_img("data key 1")), &ctx)
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+        }
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("data key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?; // this will create a delta
+
+            {
+                // update the partitioning to include the test key space, otherwise they
+                // will be dropped by image layer creation
+                let mut guard = child.partitioning.lock().await;
+                let ((partitioning, _), partition_lsn) = &mut *guard;
+                partitioning
+                    .parts
+                    .push(KeySpace::single(base_key..base_key_nonexist)); // exclude the nonexist key
+                *partition_lsn = lsn;
+            }
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for the keys, TODO: check if the image layer is created
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
+            .await
+            .unwrap_err()
+            .is_missing_key_error());
+        assert!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            Some(test_img("data key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("data key 2"))
+        );
+        assert!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
+                .await
+                .unwrap_err()
+                .is_missing_key_error()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?;
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        let cancel = CancellationToken::new();
+
+        let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
+        let mut base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
+        let mut base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
+        base_key.field1 = AUX_KEY_PREFIX;
+        base_key_child.field1 = AUX_KEY_PREFIX;
+        base_key_nonexist.field1 = AUX_KEY_PREFIX;
+
+        let mut lsn = Lsn(0x20);
+
+        {
+            let mut writer = tline.writer().await;
+            writer
+                .put(
+                    base_key,
+                    lsn,
+                    &Value::Image(test_img("metadata key 1")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            tline.freeze_and_flush().await?; // this will create an image layer
+
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
+        let child = tenant
+            .branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
+            .await
+            .unwrap();
+
+        lsn.0 += 0x10;
+
+        {
+            let mut writer = child.writer().await;
+            writer
+                .put(
+                    base_key_child,
+                    lsn,
+                    &Value::Image(test_img("metadata key 2")),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(lsn);
+            drop(writer);
+
+            child.freeze_and_flush().await?;
+
+            child
+                .compact(
+                    &cancel,
+                    {
+                        let mut set = EnumSet::empty();
+                        set.insert(CompactFlags::ForceImageLayerCreation);
+                        set.insert(CompactFlags::ForceRepartition);
+                        set
+                    },
+                    &ctx,
+                )
+                .await?; // force create an image layer for metadata keys
+            tenant
+                .gc_iteration(Some(child.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
+                .await?;
+        }
+
+        async fn get_vectored_impl_wrapper(
+            tline: &Arc<Timeline>,
+            key: Key,
+            lsn: Lsn,
+            ctx: &RequestContext,
+        ) -> Result<Option<Bytes>, GetVectoredError> {
+            let mut reconstruct_state = ValuesReconstructState::new();
+            let mut res = tline
+                .get_vectored_impl(
+                    KeySpace::single(key..key.next()),
+                    lsn,
+                    &mut reconstruct_state,
+                    ctx,
+                )
+                .await?;
+            Ok(res.pop_last().map(|(k, v)| {
+                assert_eq!(k, key);
+                v.unwrap()
+            }))
+        }
+
+        // test vectored get on parent timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
+            Some(test_img("metadata key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
+        // test vectored get on child timeline
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+
+        Ok(())
    }
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -62,14 +62,10 @@ use super::{
    CommandRequest, DownloadCommand,
 };

-/// For each tenant, how long must have passed since the last download_tenant call before
-/// calling it again.  This is approximately the time by which local data is allowed
-/// to fall behind remote data.
-///
-/// TODO: this should just be a default, and the actual period should be controlled
-/// via the heatmap itself
-/// `<ttps://github.com/neondatabase/neon/issues/6200>`
-const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+/// For each tenant, the default period that must have passed since the last download_tenant call before
+/// calling it again.  This default is replaced with the value of [`HeatMapTenant::upload_period_ms`] after the
+/// first download, if the uploader populated it.
+const DEFAULT_DOWNLOAD_INTERVAL: Duration = Duration::from_millis(60000);

 /// Range of concurrency we may use when downloading layers within a timeline.  This is independent
 /// for each tenant we're downloading: the concurrency of _tenants_ is defined separately in
@@ -152,14 +148,22 @@ pub(super) struct SecondaryDetailTimeline {
    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
 }

+// Aspects of a heatmap that we remember after downloading it
+#[derive(Clone, Debug)]
+struct DownloadSummary {
+    etag: Etag,
+    #[allow(unused)]
+    mtime: SystemTime,
+    upload_period: Duration,
+}
+
 /// This state is written by the secondary downloader, it is opaque
 /// to TenantManager
 #[derive(Debug)]
 pub(super) struct SecondaryDetail {
    pub(super) config: SecondaryLocationConfig,

-    last_download: Option<Instant>,
-    last_etag: Option<Etag>,
+    last_download: Option<DownloadSummary>,
    next_download: Option<Instant>,
    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
 }
@@ -189,7 +193,6 @@ impl SecondaryDetail {
        Self {
            config,
            last_download: None,
-            last_etag: None,
            next_download: None,
            timelines: HashMap::new(),
        }
@@ -243,9 +246,8 @@ impl SecondaryDetail {

 struct PendingDownload {
    secondary_state: Arc<SecondaryTenant>,
-    last_download: Option<Instant>,
+    last_download: Option<DownloadSummary>,
    target_time: Option<Instant>,
-    period: Option<Duration>,
 }

 impl scheduler::PendingJob for PendingDownload {
@@ -295,10 +297,17 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

        tracing::debug!("Secondary tenant download completed");

-        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
-        // take priority to run again.
        let mut detail = secondary_state.detail.lock().unwrap();
-        detail.next_download = Some(Instant::now() + period_jitter(DOWNLOAD_FRESHEN_INTERVAL, 5));
+
+        let period = detail
+            .last_download
+            .as_ref()
+            .map(|d| d.upload_period)
+            .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL);
+
+        // We advance next_download irrespective of errors: we don't want error cases to result in
+        // expensive busy-polling.
+        detail.next_download = Some(Instant::now() + period_jitter(period, 5));
    }

    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
@@ -331,11 +340,11 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    if detail.next_download.is_none() {
                        // Initialize randomly in the range from 0 to our interval: this uniformly spreads the start times.  Subsequent
                        // rounds will use a smaller jitter to avoid accidentally synchronizing later.
-                        detail.next_download = Some(now.checked_add(period_warmup(DOWNLOAD_FRESHEN_INTERVAL)).expect(
+                        detail.next_download = Some(now.checked_add(period_warmup(DEFAULT_DOWNLOAD_INTERVAL)).expect(
                        "Using our constant, which is known to be small compared with clock range",
                    ));
                    }
-                    (detail.last_download, detail.next_download.unwrap())
+                    (detail.last_download.clone(), detail.next_download.unwrap())
                };

                if now > next_download {
@@ -343,7 +352,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                        secondary_state: secondary_tenant,
                        last_download,
                        target_time: Some(next_download),
-                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
                    })
                } else {
                    None
@@ -369,7 +377,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

        Ok(PendingDownload {
            target_time: None,
-            period: None,
            last_download: None,
            secondary_state: tenant,
        })
@@ -386,7 +393,6 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
            secondary_state,
            last_download,
            target_time,
-            period,
        } = job;

        let (completion, barrier) = utils::completion::channel();
@@ -423,20 +429,15 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo

            // If the job had a target execution time, we may check our final execution
            // time against that for observability purposes.
-            if let (Some(target_time), Some(period)) = (target_time, period) {
-                // Only track execution lag if this isn't our first download: otherwise, it is expected
-                // that execution will have taken longer than our configured interval, for example
-                // when starting up a pageserver and
-                if last_download.is_some() {
-                    // Elapsed time includes any scheduling lag as well as the execution of the job
-                    let elapsed = Instant::now().duration_since(target_time);
+            if let (Some(target_time), Some(last_download)) = (target_time, last_download) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = Instant::now().duration_since(target_time);

-                    warn_when_period_overrun(
-                        elapsed,
-                        period,
-                        BackgroundLoopKind::SecondaryDownload,
-                    );
-                }
+                warn_when_period_overrun(
+                    elapsed,
+                    last_download.upload_period,
+                    BackgroundLoopKind::SecondaryDownload,
+                );
            }

            CompleteDownload {
@@ -525,12 +526,12 @@ impl<'a> TenantDownloader<'a> {
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();

        // We will use the etag from last successful download to make the download conditional on changes
-        let last_etag = self
+        let last_download = self
            .secondary_state
            .detail
            .lock()
            .unwrap()
-            .last_etag
+            .last_download
            .clone();

        // Download the tenant's heatmap
@@ -539,7 +540,7 @@ impl<'a> TenantDownloader<'a> {
            etag: heatmap_etag,
            bytes: heatmap_bytes,
        } = match tokio::select!(
-            bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
+            bytes = self.download_heatmap(last_download.as_ref().map(|d| &d.etag)) => {bytes?},
            _ = self.secondary_state.cancel.cancelled() => return Ok(())
        ) {
            HeatMapDownload::Unmodified => {
@@ -568,6 +569,39 @@ impl<'a> TenantDownloader<'a> {
            heatmap.timelines.len()
        );

+        // Get or initialize the local disk state for the timelines we will update
+        let mut timeline_states = HashMap::new();
+        for timeline in &heatmap.timelines {
+            let timeline_state = self
+                .secondary_state
+                .detail
+                .lock()
+                .unwrap()
+                .timelines
+                .get(&timeline.timeline_id)
+                .cloned();
+
+            let timeline_state = match timeline_state {
+                Some(t) => t,
+                None => {
+                    // We have no existing state: need to scan local disk for layers first.
+                    let timeline_state =
+                        init_timeline_state(self.conf, tenant_shard_id, timeline).await;
+
+                    // Re-acquire detail lock now that we're done with async load from local FS
+                    self.secondary_state
+                        .detail
+                        .lock()
+                        .unwrap()
+                        .timelines
+                        .insert(timeline.timeline_id, timeline_state.clone());
+                    timeline_state
+                }
+            };
+
+            timeline_states.insert(timeline.timeline_id, timeline_state);
+        }
+
        // Clean up any local layers that aren't in the heatmap.  We do this first for all timelines, on the general
        // principle that deletions should be done before writes wherever possible, and so that we can use this
        // phase to initialize our SecondaryProgress.
@@ -578,6 +612,10 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
+            let timeline_state = timeline_states
+                .remove(&timeline.timeline_id)
+                .expect("Just populated above");
+
            if self.secondary_state.cancel.is_cancelled() {
                tracing::debug!(
                    "Cancelled before downloading timeline {}",
@@ -587,7 +625,7 @@ impl<'a> TenantDownloader<'a> {
            }

            let timeline_id = timeline.timeline_id;
-            self.download_timeline(timeline, ctx)
+            self.download_timeline(timeline, timeline_state, ctx)
                .instrument(tracing::info_span!(
                    "secondary_download_timeline",
                    tenant_id=%tenant_shard_id.tenant_id,
@@ -599,7 +637,30 @@ impl<'a> TenantDownloader<'a> {

        // Only update last_etag after a full successful download: this way will not skip
        // the next download, even if the heatmap's actual etag is unchanged.
-        self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
+        self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary {
+            etag: heatmap_etag,
+            mtime: heatmap_mtime,
+            upload_period: heatmap
+                .upload_period_ms
+                .map(|ms| Duration::from_millis(ms as u64))
+                .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL),
+        });
+
+        // Robustness: we should have updated progress properly, but in case we didn't, make sure
+        // we don't leave the tenant in a state where we claim to have successfully downloaded
+        // everything, but our progress is incomplete.  The invariant here should be that if
+        // we have set `last_download` to this heatmap's etag, then the next time we see that
+        // etag we can safely do no work (i.e. we must be complete).
+        let mut progress = self.secondary_state.progress.lock().unwrap();
+        debug_assert!(progress.layers_downloaded == progress.layers_total);
+        debug_assert!(progress.bytes_downloaded == progress.bytes_total);
+        if progress.layers_downloaded != progress.layers_total
+            || progress.bytes_downloaded != progress.bytes_total
+        {
+            tracing::warn!("Correcting drift in progress stats ({progress:?})");
+            progress.layers_downloaded = progress.layers_total;
+            progress.bytes_downloaded = progress.bytes_total;
+        }

        Ok(())
    }
@@ -776,6 +837,7 @@ impl<'a> TenantDownloader<'a> {
    async fn download_timeline(
        &self,
        timeline: HeatMapTimeline,
+        timeline_state: SecondaryDetailTimeline,
        ctx: &RequestContext,
    ) -> Result<(), UpdateError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
@@ -784,34 +846,6 @@ impl<'a> TenantDownloader<'a> {
        // Accumulate updates to the state
        let mut touched = Vec::new();

-        // Clone a view of what layers already exist on disk
-        let timeline_state = self
-            .secondary_state
-            .detail
-            .lock()
-            .unwrap()
-            .timelines
-            .get(&timeline.timeline_id)
-            .cloned();
-
-        let timeline_state = match timeline_state {
-            Some(t) => t,
-            None => {
-                // We have no existing state: need to scan local disk for layers first.
-                let timeline_state =
-                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
-
-                // Re-acquire detail lock now that we're done with async load from local FS
-                self.secondary_state
-                    .detail
-                    .lock()
-                    .unwrap()
-                    .timelines
-                    .insert(timeline.timeline_id, timeline_state.clone());
-                timeline_state
-            }
-        };
-
        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());

        let mut download_futs = Vec::new();
@@ -1001,6 +1035,14 @@ impl<'a> TenantDownloader<'a> {
                    "Skipped downloading missing layer {}, raced with compaction/gc?",
                    layer.name
                );
+
+                // If the layer is 404, adjust the progress statistics to reflect that we will not download it.
+                let mut progress = self.secondary_state.progress.lock().unwrap();
+                progress.layers_total = progress.layers_total.saturating_sub(1);
+                progress.bytes_total = progress
+                    .bytes_total
+                    .saturating_sub(layer.metadata.file_size);
+
                return Ok(None);
            }
            Err(e) => return Err(e.into()),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -113,12 +113,20 @@ impl From<VectoredValueReconstructState> for ValueReconstructState {
    }
 }

-/// Bag of data accumulated during a vectored get
+/// Bag of data accumulated during a vectored get.
 pub(crate) struct ValuesReconstructState {
+    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
+    /// should not expect to get anything from this hashmap.
    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
-
+    /// The keys which are already retrieved
    keys_done: KeySpaceRandomAccum,
+
+    /// The keys covered by the image layers
+    keys_with_image_coverage: Option<Range<Key>>,
+
+    // Statistics that are still accessible as a caller of `get_vectored_impl`.
    layers_visited: u32,
+    delta_layers_visited: u32,
 }

 impl ValuesReconstructState {
@@ -126,7 +134,9 @@ impl ValuesReconstructState {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
            layers_visited: 0,
+            delta_layers_visited: 0,
        }
    }

@@ -140,8 +150,17 @@ impl ValuesReconstructState {
        }
    }

-    pub(crate) fn on_layer_visited(&mut self) {
+    pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
        self.layers_visited += 1;
+        if let ReadableLayer::PersistentLayer(layer) = layer {
+            if layer.layer_desc().is_delta() {
+                self.delta_layers_visited += 1;
+            }
+        }
+    }
+
+    pub(crate) fn get_delta_layers_visited(&self) -> u32 {
+        self.delta_layers_visited
    }

    pub(crate) fn get_layers_visited(&self) -> u32 {
@@ -171,6 +190,16 @@ impl ValuesReconstructState {
        }
    }

+    /// On hitting an image layer, we can mark all keys in its range as done, because
+    /// if the image layer does not contain a key, that key was deleted or never added.
+    pub(crate) fn on_image_layer_visited(&mut self, key_range: &Range<Key>) {
+        let prev_val = self.keys_with_image_coverage.replace(key_range.clone());
+        assert_eq!(
+            prev_val, None,
+            "should consume the keyspace before the next iteration"
+        );
+    }
+
    /// Update the state collected for a given key.
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
@@ -233,8 +262,12 @@ impl ValuesReconstructState {

    /// Returns the key space describing the keys that have
    /// been marked as completed since the last call to this function.
-    pub(crate) fn consume_done_keys(&mut self) -> KeySpace {
-        self.keys_done.consume_keyspace()
+    /// Returns individual keys done, and the image layer coverage.
+    pub(crate) fn consume_done_keys(&mut self) -> (KeySpace, Option<Range<Key>>) {
+        (
+            self.keys_done.consume_keyspace(),
+            self.keys_with_image_coverage.take(),
+        )
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -158,6 +158,7 @@ pub struct ImageLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

+    key_range: Range<Key>,
    lsn: Lsn,

    file: VirtualFile,
@@ -419,6 +420,7 @@ impl ImageLayerInner {
            file,
            file_id,
            max_vectored_read_bytes,
+            key_range: actual_summary.key_range,
        }))
    }

@@ -478,6 +480,8 @@ impl ImageLayerInner {
        self.do_reads_and_update_state(reads, reconstruct_state, ctx)
            .await;

+        reconstruct_state.on_image_layer_visited(&self.key_range);
+
        Ok(())
    }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,10 +18,10 @@ use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
-        AUX_FILES_KEY, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        AUX_FILES_KEY, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
+        NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
    },
-    keyspace::{KeySpaceAccum, SparseKeyPartitioning},
+    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
        AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, DownloadRemoteLayersTaskInfo,
        DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo,
@@ -60,7 +60,6 @@ use std::{
    ops::ControlFlow,
 };

-use crate::tenant::timeline::init::LocalLayerFileMetadata;
 use crate::{
    aux_file::AuxFileSizeEstimator,
    tenant::{
@@ -89,6 +88,9 @@ use crate::{
    metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize,
 };
 use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};
+use crate::{
+    pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::timeline::init::LocalLayerFileMetadata,
+};
 use crate::{
    pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind},
    virtual_file::{MaybeFatalIo, VirtualFile},
@@ -267,6 +269,8 @@ pub struct Timeline {
    // Atomic would be more appropriate here.
    last_freeze_ts: RwLock<Instant>,

+    pub(crate) standby_horizon: AtomicLsn,
+
    // WAL redo manager. `None` only for broken tenants.
    walredo_mgr: Option<Arc<super::WalRedoManager>>,

@@ -346,8 +350,8 @@ pub struct Timeline {
    // though let's keep them both for better error visibility.
    pub initdb_lsn: Lsn,

-    /// When did we last calculate the partitioning?
-    partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,
+    /// When did we last calculate the partitioning? `pub(super)` so that test cases can access it.
+    pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>,

    /// Configuration: how often should the partitioning be recalculated.
    repartition_threshold: u64,
@@ -481,6 +485,11 @@ impl GcCutoffs {
    }
 }

+pub(crate) struct TimelineVisitOutcome {
+    completed_keyspace: KeySpace,
+    image_covered_keyspace: KeySpace,
+}
+
 /// An error happened in a get() operation.
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum PageReconstructError {
@@ -505,6 +514,13 @@ pub(crate) enum PageReconstructError {
    MissingKey(MissingKeyError),
 }

+impl GetVectoredError {
+    #[cfg(test)]
+    pub(crate) fn is_missing_key_error(&self) -> bool {
+        matches!(self, Self::MissingKey(_))
+    }
+}
+
 #[derive(Debug)]
 pub struct MissingKeyError {
    key: Key,
@@ -782,6 +798,11 @@ pub(crate) enum ShutdownMode {
    Hard,
 }

+struct ImageLayerCreationOutcome {
+    image: Option<ResidentLayer>,
+    next_start_key: Key,
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -883,7 +904,7 @@ impl Timeline {
                }

                let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, reconstruct_state, ctx)
+                    .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
                    .await;

                if self.conf.validate_vectored_get {
@@ -1028,7 +1049,12 @@ impl Timeline {
            }
            GetVectoredImpl::Vectored => {
                let vectored_res = self
-                    .get_vectored_impl(keyspace.clone(), lsn, ValuesReconstructState::new(), ctx)
+                    .get_vectored_impl(
+                        keyspace.clone(),
+                        lsn,
+                        &mut ValuesReconstructState::new(),
+                        ctx,
+                    )
                    .await;

                if self.conf.validate_vectored_get {
@@ -1116,7 +1142,7 @@ impl Timeline {
            .get_vectored_impl(
                keyspace.clone(),
                lsn,
-                ValuesReconstructState::default(),
+                &mut ValuesReconstructState::default(),
                ctx,
            )
            .await;
@@ -1193,7 +1219,7 @@ impl Timeline {
        &self,
        keyspace: KeySpace,
        lsn: Lsn,
-        mut reconstruct_state: ValuesReconstructState,
+        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        let get_kind = if keyspace.total_raw_size() == 1 {
@@ -1205,7 +1231,7 @@ impl Timeline {
        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
            .for_get_kind(get_kind)
            .start_timer();
-        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
+        self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx)
            .await?;
        get_data_timer.stop_and_record();

@@ -1214,7 +1240,8 @@ impl Timeline {
            .start_timer();
        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
        let layers_visited = reconstruct_state.get_layers_visited();
-        for (key, res) in reconstruct_state.keys {
+
+        for (key, res) in std::mem::take(&mut reconstruct_state.keys) {
            match res {
                Err(err) => {
                    results.insert(key, Err(err));
@@ -2254,6 +2281,8 @@ impl Timeline {
                compaction_lock: tokio::sync::Mutex::default(),
                gc_lock: tokio::sync::Mutex::default(),

+                standby_horizon: AtomicLsn::new(0),
+
                timeline_get_throttle: resources.timeline_get_throttle,

                aux_files: tokio::sync::Mutex::new(AuxFilesState {
@@ -3287,12 +3316,15 @@ impl Timeline {

        let mut cont_lsn = Lsn(request_lsn.0 + 1);

-        loop {
+        let missing_keyspace = loop {
            if self.cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
            }

-            let completed = Self::get_vectored_reconstruct_data_timeline(
+            let TimelineVisitOutcome {
+                completed_keyspace: completed,
+                image_covered_keyspace,
+            } = Self::get_vectored_reconstruct_data_timeline(
                timeline,
                keyspace.clone(),
                cont_lsn,
@@ -3311,12 +3343,31 @@ impl Timeline {
                ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
            });

-            // Keyspace is fully retrieved, no ancestor timeline, or metadata scan (where we do not look
-            // into ancestor timelines). TODO: is there any other metadata which we want to inherit?
-            if keyspace.total_raw_size() == 0 || timeline.ancestor_timeline.is_none() {
-                break;
+            // Keyspace is fully retrieved
+            if keyspace.is_empty() {
+                break None;
            }

+            // Not fully retrieved but no ancestor timeline.
+            if timeline.ancestor_timeline.is_none() {
+                break Some(keyspace);
+            }
+
+            // Now we check whether there are keys covered by an image layer that do not exist in
+            // that image layer, which means that those keys do not exist.
+
+            // The block below will stop the vectored search if any of the keys encountered an image layer
+            // which did not contain a snapshot for said key. Since we have already removed all completed
+            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
+            // space. If that's not the case, we had at least one key encounter a gap in the image layer
+            // and stop the search as a result of that.
+            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            if !removed.is_empty() {
+                break Some(removed);
+            }
+            // If we reached this point, `remove_overlapping_with` should not have made any change to the
+            // keyspace.
+
            // Take the min to avoid reconstructing a page with data newer than request Lsn.
            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
            timeline_owned = timeline
@@ -3324,14 +3375,14 @@ impl Timeline {
                .await
                .map_err(GetVectoredError::GetReadyAncestorError)?;
            timeline = &*timeline_owned;
-        }
+        };

-        if keyspace.total_raw_size() != 0 {
+        if let Some(missing_keyspace) = missing_keyspace {
            return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: keyspace.start().unwrap(), /* better if we can store the full keyspace */
+                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
                shard: self
                    .shard_identity
-                    .get_shard_number(&keyspace.start().unwrap()),
+                    .get_shard_number(&missing_keyspace.start().unwrap()),
                cont_lsn,
                request_lsn,
                ancestor_lsn: Some(timeline.ancestor_lsn),
@@ -3356,6 +3407,9 @@ impl Timeline {
    ///
    /// At each iteration pop the top of the fringe (the layer with the highest Lsn)
    /// and get all the required reconstruct data from the layer in one go.
+    ///
+    /// Returns the completed keyspace and the keyspaces with image coverage. The caller
+    /// decides how to deal with these two keyspaces.
    async fn get_vectored_reconstruct_data_timeline(
        timeline: &Timeline,
        keyspace: KeySpace,
@@ -3363,20 +3417,27 @@ impl Timeline {
        reconstruct_state: &mut ValuesReconstructState,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> Result<KeySpace, GetVectoredError> {
+    ) -> Result<TimelineVisitOutcome, GetVectoredError> {
        let mut unmapped_keyspace = keyspace.clone();
        let mut fringe = LayerFringe::new();

        let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();

        loop {
            if cancel.is_cancelled() {
                return Err(GetVectoredError::Cancelled);
            }

-            let keys_done_last_step = reconstruct_state.consume_done_keys();
+            let (keys_done_last_step, keys_with_image_coverage) =
+                reconstruct_state.consume_done_keys();
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);
+            if let Some(keys_with_image_coverage) = keys_with_image_coverage {
+                unmapped_keyspace
+                    .remove_overlapping_with(&KeySpace::single(keys_with_image_coverage.clone()));
+                image_covered_keyspace.add_range(keys_with_image_coverage);
+            }

            // Do not descent any further if the last layer we visited
            // completed all keys in the keyspace it inspected. This is not
@@ -3448,13 +3509,16 @@ impl Timeline {
                unmapped_keyspace = keyspace_to_read;
                cont_lsn = next_cont_lsn;

-                reconstruct_state.on_layer_visited();
+                reconstruct_state.on_layer_visited(&layer_to_read);
            } else {
                break;
            }
        }

-        Ok(completed_keyspace)
+        Ok(TimelineVisitOutcome {
+            completed_keyspace,
+            image_covered_keyspace: image_covered_keyspace.consume_keyspace(),
+        })
    }

    /// # Cancel-safety
@@ -4134,6 +4198,176 @@ impl Timeline {
        false
    }

+    /// Create an image layer for Postgres data: reads the keys of `partition` at `lsn` (skipping keys disposable on this
+    /// shard) and writes them out. Assumes the partition is not too large, so that at most one image layer is produced.
+    async fn create_image_layer_for_rel_blocks(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        start: Key,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        let mut wrote_keys = false;
+
+        let mut key_request_accum = KeySpaceAccum::new();
+        for range in &partition.ranges {
+            let mut key = range.start;
+            while key < range.end {
+                // Decide whether to retain this key: usually we do, but sharded tenants may
+                // need to drop keys that don't belong to them.  If we retain the key, add it
+                // to `key_request_accum` for later issuing a vectored get.
+                if self.shard_identity.is_key_disposable(&key) {
+                    debug!(
+                        "Dropping key {} during compaction (it belongs on shard {:?})",
+                        key,
+                        self.shard_identity.get_shard_number(&key)
+                    );
+                } else {
+                    key_request_accum.add_key(key);
+                }
+
+                let last_key_in_range = key.next() == range.end;
+                key = key.next();
+
+                // Maybe flush `key_request_accum`: the batch is full, or this was the last key of the range.
+                if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
+                    || (last_key_in_range && key_request_accum.raw_size() > 0)
+                {
+                    let results = self
+                        .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
+                        .await?;
+
+                    for (img_key, img) in results {
+                        let img = match img {
+                            Ok(img) => img,
+                            Err(err) => {
+                                // If we fail to reconstruct a VM or FSM page, we can zero the
+                                // page without losing any actual user data. That seems better
+                                // than failing repeatedly and getting stuck.
+                                //
+                                // We had a bug at one point, where we truncated the FSM and VM
+                                // in the pageserver, but the Postgres didn't know about that
+                                // and continued to generate incremental WAL records for pages
+                                // that didn't exist in the pageserver. Trying to replay those
+                                // WAL records failed to find the previous image of the page.
+                                // This special case allows us to recover from that situation.
+                                // See https://github.com/neondatabase/neon/issues/2601.
+                                //
+                                // Unfortunately we cannot do this for the main fork, or for
+                                // any metadata keys, as that would lead to actual data
+                                // loss.
+                                if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key) {
+                                    warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                    ZERO_PAGE.clone()
+                                } else {
+                                    return Err(CreateImageLayersError::PageReconstructError(err));
+                                }
+                            }
+                        };
+
+                        // Write all the keys we just read into our new image layer.
+                        image_layer_writer.put_image(img_key, img, ctx).await?;
+                        wrote_keys = true;
+                    }
+                }
+            }
+        }
+
+        if wrote_keys {
+            // Normal path: we have written some data into the new image layer for this
+            // partition, so flush it to disk.
+            let image_layer = image_layer_writer.finish(self, ctx).await?;
+            Ok(ImageLayerCreationOutcome {
+                image: Some(image_layer),
+                next_start_key: img_range.end,
+            })
+        } else {
+            // Special case: the image layer may be empty if this is a sharded tenant and the
+            // partition does not cover any keys owned by this shard.  In this case, to ensure
+            // we don't leave gaps between image layers, leave `start` where it is, so that the next
+            // layer we write will cover the key range that we just scanned.
+            tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+            Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: start,
+            })
+        }
+    }
+
+    /// Create an image layer for metadata keys. For now this produces a single image layer for all metadata keys in
+    /// `partition`: metadata keys cannot exceed the basebackup size limit, so they are expected to fit in one layer.
+    /// In `Try` mode nothing is written unless enough delta layers were visited; `Initial` mode is rejected by an assert.
+    #[allow(clippy::too_many_arguments)]
+    async fn create_image_layer_for_metadata_keys(
+        self: &Arc<Self>,
+        partition: &KeySpace,
+        mut image_layer_writer: ImageLayerWriter,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        img_range: Range<Key>,
+        mode: ImageLayerCreationMode,
+    ) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
+        assert!(!matches!(mode, ImageLayerCreationMode::Initial));
+
+        // Read all metadata keys of the partition at `lsn`; `reconstruct_state` is queried below for delta layers visited.
+        let mut reconstruct_state = ValuesReconstructState::default();
+        let data = self
+            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .await?;
+        let (data, total_kb_retrieved, total_key_retrieved) = {
+            let mut new_data = BTreeMap::new();
+            let mut total_kb_retrieved = 0;
+            let mut total_key_retrieved = 0;
+            for (k, v) in data {
+                let v = v.map_err(CreateImageLayersError::PageReconstructError)?;
+                total_kb_retrieved += KEY_SIZE + v.len(); // raw size here; divided by 1024 when returned below
+                total_key_retrieved += 1;
+                new_data.insert(k, v);
+            }
+            (new_data, total_kb_retrieved / 1024, total_key_retrieved)
+        };
+        let delta_file_accessed = reconstruct_state.get_delta_layers_visited();
+
+        let trigger_generation = delta_file_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
+        info!(
+            "generate image layers for metadata keys: trigger_generation={trigger_generation}, \
+                delta_file_accessed={delta_file_accessed}, total_kb_retrieved={total_kb_retrieved}, \
+                total_key_retrieved={total_key_retrieved}"
+        );
+        if !trigger_generation && mode == ImageLayerCreationMode::Try {
+            return Ok(ImageLayerCreationOutcome {
+                image: None,
+                next_start_key: img_range.end,
+            });
+        }
+        let has_keys = !data.is_empty();
+        for (k, v) in data {
+            // Even if the value is empty (deleted), we do not delete it for now until we can ensure vectored get
+            // considers this situation properly.
+            // if v.is_empty() {
+            //     continue;
+            // }
+
+            // No need to handle sharding b/c metadata keys are always on the 0-th shard.
+
+            // TODO: split image layers to avoid too large layer files. Too large image files are not handled
+            // on the normal data path either.
+            image_layer_writer.put_image(k, v, ctx).await?;
+        }
+        Ok(ImageLayerCreationOutcome {
+            image: if has_keys {
+                let image_layer = image_layer_writer.finish(self, ctx).await?;
+                Some(image_layer)
+            } else {
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                None
+            },
+            next_start_key: img_range.end,
+        })
+    }
+
    #[tracing::instrument(skip_all, fields(%lsn, %mode))]
    async fn create_image_layers(
        self: &Arc<Timeline>,
@@ -4175,19 +4409,17 @@ impl Timeline {

        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
-
-            if partition.overlaps(&Key::metadata_key_range()) {
-                // TODO(chi): The next patch will correctly create image layers for metadata keys, and it would be a
-                // rather big change. Keep this patch small for now.
-                match mode {
-                    ImageLayerCreationMode::Force | ImageLayerCreationMode::Try => {
-                        // skip image layer creation anyways for metadata keys.
-                        start = img_range.end;
-                        continue;
-                    }
-                    ImageLayerCreationMode::Initial => {
-                        return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
-                    }
+            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
+            if compact_metadata {
+                for range in &partition.ranges {
+                    assert!(
+                        range.start.field1 >= METADATA_KEY_BEGIN_PREFIX
+                            && range.end.field1 <= METADATA_KEY_END_PREFIX,
+                        "metadata keys must be partitioned separately"
+                    );
+                }
+                if mode == ImageLayerCreationMode::Initial {
+                    return Err(CreateImageLayersError::Other(anyhow::anyhow!("no image layer should be created for metadata keys when flushing frozen layers")));
                }
            } else if let ImageLayerCreationMode::Try = mode {
                // check_for_image_layers = false -> skip
@@ -4198,7 +4430,7 @@ impl Timeline {
                }
            }

-            let mut image_layer_writer = ImageLayerWriter::new(
+            let image_layer_writer = ImageLayerWriter::new(
                self.conf,
                self.timeline_id,
                self.tenant_shard_id,
@@ -4214,87 +4446,39 @@ impl Timeline {
                )))
            });

-            let mut wrote_keys = false;
+            if !compact_metadata {
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_rel_blocks(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        start,
+                    )
+                    .await?;

-            let mut key_request_accum = KeySpaceAccum::new();
-            for range in &partition.ranges {
-                let mut key = range.start;
-                while key < range.end {
-                    // Decide whether to retain this key: usually we do, but sharded tenants may
-                    // need to drop keys that don't belong to them.  If we retain the key, add it
-                    // to `key_request_accum` for later issuing a vectored get
-                    if self.shard_identity.is_key_disposable(&key) {
-                        debug!(
-                            "Dropping key {} during compaction (it belongs on shard {:?})",
-                            key,
-                            self.shard_identity.get_shard_number(&key)
-                        );
-                    } else {
-                        key_request_accum.add_key(key);
-                    }
-
-                    let last_key_in_range = key.next() == range.end;
-                    key = key.next();
-
-                    // Maybe flush `key_rest_accum`
-                    if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
-                        || (last_key_in_range && key_request_accum.raw_size() > 0)
-                    {
-                        let results = self
-                            .get_vectored(key_request_accum.consume_keyspace(), lsn, ctx)
-                            .await?;
-
-                        for (img_key, img) in results {
-                            let img = match img {
-                                Ok(img) => img,
-                                Err(err) => {
-                                    // If we fail to reconstruct a VM or FSM page, we can zero the
-                                    // page without losing any actual user data. That seems better
-                                    // than failing repeatedly and getting stuck.
-                                    //
-                                    // We had a bug at one point, where we truncated the FSM and VM
-                                    // in the pageserver, but the Postgres didn't know about that
-                                    // and continued to generate incremental WAL records for pages
-                                    // that didn't exist in the pageserver. Trying to replay those
-                                    // WAL records failed to find the previous image of the page.
-                                    // This special case allows us to recover from that situation.
-                                    // See https://github.com/neondatabase/neon/issues/2601.
-                                    //
-                                    // Unfortunately we cannot do this for the main fork, or for
-                                    // any metadata keys, keys, as that would lead to actual data
-                                    // loss.
-                                    if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
-                                    {
-                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
-                                        ZERO_PAGE.clone()
-                                    } else {
-                                        return Err(CreateImageLayersError::PageReconstructError(
-                                            err,
-                                        ));
-                                    }
-                                }
-                            };
-
-                            // Write all the keys we just read into our new image layer.
-                            image_layer_writer.put_image(img_key, img, ctx).await?;
-                            wrote_keys = true;
-                        }
-                    }
-                }
-            }
-
-            if wrote_keys {
-                // Normal path: we have written some data into the new image layer for this
-                // partition, so flush it to disk.
-                start = img_range.end;
-                let image_layer = image_layer_writer.finish(self, ctx).await?;
-                image_layers.push(image_layer);
+                start = next_start_key;
+                image_layers.extend(image);
            } else {
-                // Special case: the image layer may be empty if this is a sharded tenant and the
-                // partition does not cover any keys owned by this shard.  In this case, to ensure
-                // we don't leave gaps between image layers, leave `start` where it is, so that the next
-                // layer we write will cover the key range that we just scanned.
-                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
+                let ImageLayerCreationOutcome {
+                    image,
+                    next_start_key,
+                } = self
+                    .create_image_layer_for_metadata_keys(
+                        partition,
+                        image_layer_writer,
+                        lsn,
+                        ctx,
+                        img_range,
+                        mode,
+                    )
+                    .await?;
+                start = next_start_key;
+                image_layers.extend(image);
            }
        }

@@ -4664,7 +4848,32 @@ impl Timeline {
            (horizon_cutoff, pitr_cutoff, retain_lsns)
        };

-        let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
+        let standby_horizon = self.standby_horizon.load();
+        // Hold GC for the standby, but as a safety guard do it only within some
+        // reasonable lag.
+        if standby_horizon != Lsn::INVALID {
+            if let Some(standby_lag) = new_gc_cutoff.checked_sub(standby_horizon) {
+                const MAX_ALLOWED_STANDBY_LAG: u64 = 10u64 << 30; // 10 GB
+                if standby_lag.0 < MAX_ALLOWED_STANDBY_LAG {
+                    new_gc_cutoff = Lsn::min(standby_horizon, new_gc_cutoff);
+                    trace!("holding off GC for standby apply LSN {}", standby_horizon);
+                } else {
+                    warn!(
+                        "standby is lagging for more than {}MB, not holding gc for it",
+                        MAX_ALLOWED_STANDBY_LAG / 1024 / 1024
+                    )
+                }
+            }
+        }
+
+        // Reset standby horizon to ignore it if it is not updated till next GC.
+        // It is an easy way to unset it when standby disappears without adding
+        // more conf options.
+        self.standby_horizon.store(Lsn::INVALID);
+        self.metrics
+            .standby_horizon_gauge
+            .set(Lsn::INVALID.0 as i64);

        let res = self
            .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff)
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -116,9 +116,13 @@ impl Timeline {

                // 3. Create new image layers for partitions that have been modified
                // "enough".
-                let dense_layers = self
+                let mut partitioning = dense_partitioning;
+                partitioning
+                    .parts
+                    .extend(sparse_partitioning.into_dense().parts);
+                let image_layers = self
                    .create_image_layers(
-                        &dense_partitioning,
+                        &partitioning,
                        lsn,
                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
                            ImageLayerCreationMode::Force
@@ -130,24 +134,8 @@ impl Timeline {
                    .await
                    .map_err(anyhow::Error::from)?;

-                // For now, nothing will be produced...
-                let sparse_layers = self
-                    .create_image_layers(
-                        &sparse_partitioning.clone().into_dense(),
-                        lsn,
-                        if flags.contains(CompactFlags::ForceImageLayerCreation) {
-                            ImageLayerCreationMode::Force
-                        } else {
-                            ImageLayerCreationMode::Try
-                        },
-                        &image_ctx,
-                    )
-                    .await
-                    .map_err(anyhow::Error::from)?;
-                assert!(sparse_layers.is_empty());
-
-                self.upload_new_image_layers(dense_layers)?;
-                dense_partitioning.parts.len()
+                self.upload_new_image_layers(image_layers)?;
+                partitioning.parts.len()
            }
            Err(err) => {
                // no partitioning? This is normal, if the timeline was just created
@@ -499,8 +487,11 @@ impl Timeline {

        for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
            if let Some(prev_key) = prev {
-                // just first fast filter
-                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
+                // Just a first, fast filter. Do not create hole entries for metadata keys: the last hole in the
+                // compaction is the gap between data keys and metadata keys.
+                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
+                    && !Key::is_metadata_key(&prev_key)
+                {
                    let key_range = prev_key..next_key;
                    // Measuring hole by just subtraction of i128 representation of key range boundaries
                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -705,6 +705,7 @@ impl ConnectionManagerState {
                    commit_lsn: info.commit_lsn,
                    safekeeper_connstr: info.safekeeper_connstr,
                    availability_zone: info.availability_zone,
+                    standby_horizon: info.standby_horizon,
                }
            }
            MessageType::SafekeeperDiscoveryResponse => {
@@ -725,6 +726,21 @@ impl ConnectionManagerState {

        WALRECEIVER_BROKER_UPDATES.inc();

+        trace!(
+            "safekeeper info update: standby_horizon(cutoff)={}",
+            timeline_update.standby_horizon
+        );
+        if timeline_update.standby_horizon != 0 {
+            // ignore reports from safekeepers not connected to replicas
+            self.timeline
+                .standby_horizon
+                .store(Lsn(timeline_update.standby_horizon));
+            self.timeline
+                .metrics
+                .standby_horizon_gauge
+                .set(timeline_update.standby_horizon as i64);
+        }
+
        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
        let old_entry = self.wal_stream_candidates.insert(
            new_safekeeper_id,
@@ -1094,6 +1110,7 @@ mod tests {
                commit_lsn,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
+                standby_horizon: 0,
            },
            latest_update,
        }
--- a/patches/pgvector.patch
+++ b/patches/pgvector.patch
@@ -0,0 +1,78 @@
+From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
+From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+Date: Fri, 2 Feb 2024 22:26:45 +0200
+Subject: [PATCH 1/1] Make v0.6.0 work with Neon
+
+Now that the WAL-logging happens as a separate step at the end of the
+build, we need a few neon-specific hints to make it work.
+---
+ src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/src/hnswbuild.c b/src/hnswbuild.c
+index 680789b..ec54dea 100644
+--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
+@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
+ 
+ 	hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Perform inserts */
+ 	HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+ 
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
+ 	/* Close relations within worker */
+ 	index_close(indexRel, indexLockmode);
+ 	table_close(heapRel, heapLockmode);
+@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
+ 	SeedRandom(42);
+ #endif
+ 
+#ifdef NEON_SMGR
+	smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
+ 	InitBuildState(buildstate, heap, index, indexInfo, forkNum);
+ 
+ 	BuildGraph(buildstate, forkNum);
+ 
+#ifdef NEON_SMGR
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
+ 	if (RelationNeedsWAL(index))
+	{
+ 		log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+ 
+#ifdef NEON_SMGR
+		{
+#if PG_VERSION_NUM >= 160000
+			RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+			RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+			SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+										   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+			SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+		}
+#endif
+	}
+
+#ifdef NEON_SMGR
+	smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
+ 	FreeBuildState(buildstate);
+ }
+ 
+-- 
+2.39.2
+
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -45,6 +45,7 @@
 */
 #include "postgres.h"

+#include "access/parallel.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlogdefs.h"
@@ -2822,10 +2823,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;

 	/*
+	 * Create the local file. In a parallel build, the leader is expected to
+	 * call this first and do it.
+	 *
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-	mdcreate(reln, MAIN_FORKNUM, false);
+	if (!IsParallelWorker())
+		mdcreate(reln, MAIN_FORKNUM, false);
 }

 /*
@@ -2849,7 +2854,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
 	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

-	unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
+	/*
+	 * In a parallel build, (only) the leader process performs the 2nd
+	 * phase.
+	 */
+	if (IsParallelWorker())
+	{
+		unlogged_build_rel = NULL;
+		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
+	}
+	else
+		unlogged_build_phase = UNLOGGED_BUILD_PHASE_2;
 }

 /*
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -1852,34 +1852,30 @@ static void
 CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
 {
 	hs->ts = 0;
-	hs->xmin.value = ~0;		/* largest unsigned value */
-	hs->catalog_xmin.value = ~0;	/* largest unsigned value */
+	hs->xmin = InvalidFullTransactionId;
+	hs->catalog_xmin = InvalidFullTransactionId;

 	for (int i = 0; i < wp->n_safekeepers; i++)
 	{
-		if (wp->safekeeper[i].appendResponse.hs.ts != 0)
+
+		if (wp->safekeeper[i].state == SS_ACTIVE)
 		{
 			HotStandbyFeedback *skhs = &wp->safekeeper[i].appendResponse.hs;

 			if (FullTransactionIdIsNormal(skhs->xmin)
-				&& FullTransactionIdPrecedes(skhs->xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->xmin) || FullTransactionIdPrecedes(skhs->xmin, hs->xmin)))
 			{
 				hs->xmin = skhs->xmin;
 				hs->ts = skhs->ts;
 			}
 			if (FullTransactionIdIsNormal(skhs->catalog_xmin)
-				&& FullTransactionIdPrecedes(skhs->catalog_xmin, hs->xmin))
+				&& (!FullTransactionIdIsValid(hs->catalog_xmin) || FullTransactionIdPrecedes(skhs->catalog_xmin, hs->catalog_xmin)))
 			{
 				hs->catalog_xmin = skhs->catalog_xmin;
 				hs->ts = skhs->ts;
 			}
 		}
 	}
-
-	if (hs->xmin.value == ~0)
-		hs->xmin = InvalidFullTransactionId;
-	if (hs->catalog_xmin.value == ~0)
-		hs->catalog_xmin = InvalidFullTransactionId;
 }

 /*
@@ -1946,14 +1942,28 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	}

 	CombineHotStanbyFeedbacks(&hsFeedback, wp);
-	if (hsFeedback.ts != 0 && memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
+	if (memcmp(&hsFeedback, &agg_hs_feedback, sizeof hsFeedback) != 0)
 	{
+		FullTransactionId xmin = hsFeedback.xmin;
+		FullTransactionId catalog_xmin = hsFeedback.catalog_xmin;
+		FullTransactionId next_xid = ReadNextFullTransactionId();
+		/*
+		 * The page server updates nextXid in the checkpoint only every 1024
+		 * transactions, so the feedback xmin can actually be larger than nextXid.
+		 * In that case the function TransactionIdInRecentPast returns false,
+		 * preventing the update of the slot's xmin.
+		 */
+		if (FullTransactionIdPrecedes(next_xid, xmin))
+			xmin = next_xid;
+		if (FullTransactionIdPrecedes(next_xid, catalog_xmin))
+			catalog_xmin = next_xid;
 		agg_hs_feedback = hsFeedback;
+		elog(DEBUG2, "ProcessStandbyHSFeedback(xmin=%d, catalog_xmin=%d", XidFromFullTransactionId(hsFeedback.xmin), XidFromFullTransactionId(hsFeedback.catalog_xmin));
 		ProcessStandbyHSFeedback(hsFeedback.ts,
-								 XidFromFullTransactionId(hsFeedback.xmin),
-								 EpochFromFullTransactionId(hsFeedback.xmin),
-								 XidFromFullTransactionId(hsFeedback.catalog_xmin),
-								 EpochFromFullTransactionId(hsFeedback.catalog_xmin));
+								 XidFromFullTransactionId(xmin),
+								 EpochFromFullTransactionId(xmin),
+								 XidFromFullTransactionId(catalog_xmin),
+								 EpochFromFullTransactionId(catalog_xmin));
 	}

 	CheckGracefulShutdown(wp);
--- a/poetry.lock
+++ b/poetry.lock
@@ -2405,6 +2405,7 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2529,13 +2530,13 @@ files = [

 [[package]]
 name = "requests"
-version = "2.31.0"
+version = "2.32.0"
 description = "Python HTTP for Humans."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
-    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
+    {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
+    {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
 ]

 [package.dependencies]
@@ -2959,6 +2960,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3196,4 +3207,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "dcde14c58a32bda5f123319a069352c458b3719f3c62977991eebb9803a46a9e"
+content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -307,7 +307,7 @@ where
 }

 async fn upload_parquet(
-    w: SerializedFileWriter<Writer<BytesMut>>,
+    mut w: SerializedFileWriter<Writer<BytesMut>>,
    len: i64,
    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<Writer<BytesMut>> {
@@ -319,11 +319,15 @@ async fn upload_parquet(

    // I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
    // finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
-    let (writer, metadata) = tokio::task::spawn_blocking(move || w.finish())
+    let (mut buffer, metadata) =
+        tokio::task::spawn_blocking(move || -> parquet::errors::Result<_> {
+            let metadata = w.finish()?;
+            let buffer = std::mem::take(w.inner_mut().get_mut());
+            Ok((buffer, metadata))
+        })
        .await
        .unwrap()?;

-    let mut buffer = writer.into_inner();
    let data = buffer.split().freeze();

    let compression = len as f64 / len_uncompressed as f64;
@@ -474,10 +478,11 @@ mod tests {
        RequestData {
            session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
            peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
-            timestamp: chrono::NaiveDateTime::from_timestamp_millis(
+            timestamp: chrono::DateTime::from_timestamp_millis(
                rng.gen_range(1703862754..1803862754),
            )
-            .unwrap(),
+            .unwrap()
+            .naive_utc(),
            application_name: Some("test".to_owned()),
            username: Some(hex::encode(rng.gen::<[u8; 4]>())),
            endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
@@ -560,15 +565,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
            ]
        );

@@ -598,11 +603,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1221738, 5, 10000),
-                (1227888, 5, 10000),
-                (1229682, 5, 10000),
-                (1229044, 5, 10000),
-                (1220322, 5, 10000)
+                (1222212, 5, 10000),
+                (1228362, 5, 10000),
+                (1230156, 5, 10000),
+                (1229518, 5, 10000),
+                (1220796, 5, 10000)
            ]
        );

@@ -634,11 +639,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1207385, 5, 10000),
-                (1207116, 5, 10000),
-                (1207409, 5, 10000),
-                (1207397, 5, 10000),
-                (1207652, 5, 10000)
+                (1207859, 5, 10000),
+                (1207590, 5, 10000),
+                (1207883, 5, 10000),
+                (1207871, 5, 10000),
+                (1208126, 5, 10000)
            ]
        );

@@ -663,15 +668,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1315008, 3, 6000),
-                (1315001, 3, 6000),
-                (1315061, 3, 6000),
-                (1315018, 3, 6000),
-                (1315148, 3, 6000),
-                (1314990, 3, 6000),
-                (1314782, 3, 6000),
-                (1315018, 3, 6000),
-                (438575, 1, 2000)
+                (1315314, 3, 6000),
+                (1315307, 3, 6000),
+                (1315367, 3, 6000),
+                (1315324, 3, 6000),
+                (1315454, 3, 6000),
+                (1315296, 3, 6000),
+                (1315088, 3, 6000),
+                (1315324, 3, 6000),
+                (438713, 1, 2000)
            ]
        );

@@ -708,7 +713,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(659240, 2, 3001), (658954, 2, 3000), (658750, 2, 2999)]
+            [(659462, 2, 3001), (659176, 2, 3000), (658972, 2, 2999)]
        );

        tmpdir.close().unwrap();
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
-requests = "^2.31.0"
+requests = "^2.32.0"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.29.0"
 aiopg = "^1.4.0"
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -186,6 +186,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
                        commit_lsn: sk_info.commit_lsn,
                        safekeeper_connstr: sk_info.safekeeper_connstr,
                        availability_zone: sk_info.availability_zone,
+                        standby_horizon: 0,
                    };

                    // note this is a blocking call
@@ -319,7 +320,7 @@ async fn task_stats(stats: Arc<BrokerStats>) {

                let now = BrokerStats::now_millis();
                if now > last_pulled && now - last_pulled > warn_duration.as_millis() as u64 {
-                    let ts = chrono::NaiveDateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
+                    let ts = chrono::DateTime::from_timestamp_millis(last_pulled as i64).expect("invalid timestamp");
                    info!("no broker updates for some time, last update: {:?}", ts);
                }
            }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -350,6 +350,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
        backup_lsn: sk_info.backup_lsn.0,
        local_start_lsn: sk_info.local_start_lsn.0,
        availability_zone: None,
+        standby_horizon: sk_info.standby_horizon.0,
    };

    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -183,9 +183,19 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
    ) -> Result<(), QueryError> {
-        if let Err(end) = self.handle_start_wal_push_guts(pgb).await {
+        let mut tli: Option<Arc<Timeline>> = None;
+        if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await {
            // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            let handle_end_fut = pgb.handle_copy_stream_end(end);
+            // If we managed to create the timeline, augment logging with current LSNs etc.
+            if let Some(tli) = tli {
+                let info = tli.get_safekeeper_info(&self.conf).await;
+                handle_end_fut
+                    .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
+                    .await;
+            } else {
+                handle_end_fut.await;
+            }
        }
        Ok(())
    }
@@ -193,6 +203,7 @@ impl SafekeeperPostgresHandler {
    pub async fn handle_start_wal_push_guts<IO: AsyncRead + AsyncWrite + Unpin>(
        &mut self,
        pgb: &mut PostgresBackend<IO>,
+        tli: &mut Option<Arc<Timeline>>,
    ) -> Result<(), CopyStreamHandlerEnd> {
        // Notify the libpq client that it's allowed to send `CopyData` messages
        pgb.write_message(&BeMessage::CopyBothResponse).await?;
@@ -222,13 +233,17 @@ impl SafekeeperPostgresHandler {
        // Read first message and create timeline if needed.
        let res = network_reader.read_first_message().await;

-        let res = if let Ok((tli, next_msg)) = res {
+        let network_res = if let Ok((timeline, next_msg)) = res {
            let pageserver_feedback_rx: tokio::sync::broadcast::Receiver<PageserverFeedback> =
-                tli.get_walreceivers().pageserver_feedback_tx.subscribe();
+                timeline
+                    .get_walreceivers()
+                    .pageserver_feedback_tx
+                    .subscribe();
+            *tli = Some(timeline.clone());

            tokio::select! {
                // todo: add read|write .context to these errors
-                r = network_reader.run(msg_tx, msg_rx, reply_tx, tli.clone(), next_msg) => r,
+                r = network_reader.run(msg_tx, msg_rx, reply_tx, timeline.clone(), next_msg) => r,
                r = network_write(pgb, reply_rx, pageserver_feedback_rx) => r,
            }
        } else {
@@ -244,13 +259,13 @@ impl SafekeeperPostgresHandler {
        match acceptor_handle {
            None => {
                // failed even before spawning; read_network should have error
-                Err(res.expect_err("no error with WalAcceptor not spawn"))
+                Err(network_res.expect_err("no error with WalAcceptor not spawn"))
            }
            Some(handle) => {
                let wal_acceptor_res = handle.await;

                // If there was any network error, return it.
-                res?;
+                network_res?;

                // Otherwise, WalAcceptor thread must have errored.
                match wal_acceptor_res {
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -23,7 +23,7 @@ use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::pageserver_feedback::PageserverFeedback;

-use std::cmp::min;
+use std::cmp::{max, min};
 use std::net::SocketAddr;
 use std::str;
 use std::sync::Arc;
@@ -85,8 +85,17 @@ impl StandbyReply {

 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub struct StandbyFeedback {
-    reply: StandbyReply,
-    hs_feedback: HotStandbyFeedback,
+    pub reply: StandbyReply,
+    pub hs_feedback: HotStandbyFeedback,
+}
+
+impl StandbyFeedback {
+    pub fn empty() -> Self {
+        StandbyFeedback {
+            reply: StandbyReply::empty(),
+            hs_feedback: HotStandbyFeedback::empty(),
+        }
+    }
 }

 /// WalSenders registry. Timeline holds it (wrapped in Arc).
@@ -162,8 +171,8 @@ impl WalSenders {
    }

    /// Get aggregated hot standby feedback (we send it to compute).
-    pub fn get_hotstandby(self: &Arc<WalSenders>) -> HotStandbyFeedback {
-        self.mutex.lock().agg_hs_feedback
+    pub fn get_hotstandby(self: &Arc<WalSenders>) -> StandbyFeedback {
+        self.mutex.lock().agg_standby_feedback
    }

    /// Record new pageserver feedback, update aggregated values.
@@ -184,6 +193,10 @@ impl WalSenders {
    fn record_standby_reply(self: &Arc<WalSenders>, id: WalSenderId, reply: &StandbyReply) {
        let mut shared = self.mutex.lock();
        let slot = shared.get_slot_mut(id);
+        debug!(
+            "Record standby reply: ts={} apply_lsn={}",
+            reply.reply_ts, reply.apply_lsn
+        );
        match &mut slot.feedback {
            ReplicationFeedback::Standby(sf) => sf.reply = *reply,
            ReplicationFeedback::Pageserver(_) => {
@@ -208,7 +221,7 @@ impl WalSenders {
                })
            }
        }
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
    }

    /// Get remote_consistent_lsn reported by the pageserver. Returns None if
@@ -226,13 +239,13 @@ impl WalSenders {
    fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
-        shared.update_hs_feedback();
+        shared.update_reply_feedback();
    }
 }

 struct WalSendersShared {
    // aggregated over all walsenders value
-    agg_hs_feedback: HotStandbyFeedback,
+    agg_standby_feedback: StandbyFeedback,
    // last feedback ever received from any pageserver, empty if none
    last_ps_feedback: PageserverFeedback,
    // total counter of pageserver feedbacks received
@@ -243,7 +256,7 @@ struct WalSendersShared {
 impl WalSendersShared {
    fn new() -> Self {
        WalSendersShared {
-            agg_hs_feedback: HotStandbyFeedback::empty(),
+            agg_standby_feedback: StandbyFeedback::empty(),
            last_ps_feedback: PageserverFeedback::empty(),
            ps_feedback_counter: 0,
            slots: Vec::new(),
@@ -260,10 +273,11 @@ impl WalSendersShared {
        self.slots[id].as_mut().expect("walsender doesn't exist")
    }

-    /// Update aggregated hot standy feedback. We just take min of valid xmins
+    /// Update aggregated hot standby and normal reply feedbacks. We just take min of valid xmins
    /// and ts.
-    fn update_hs_feedback(&mut self) {
+    fn update_reply_feedback(&mut self) {
        let mut agg = HotStandbyFeedback::empty();
+        let mut reply_agg = StandbyReply::empty();
        for ws_state in self.slots.iter().flatten() {
            if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback {
                let hs_feedback = standby_feedback.hs_feedback;
@@ -276,7 +290,7 @@ impl WalSendersShared {
                    } else {
                        agg.xmin = hs_feedback.xmin;
                    }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
                }
                if hs_feedback.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
                    if agg.catalog_xmin != INVALID_FULL_TRANSACTION_ID {
@@ -284,11 +298,43 @@ impl WalSendersShared {
                    } else {
                        agg.catalog_xmin = hs_feedback.catalog_xmin;
                    }
-                    agg.ts = min(agg.ts, hs_feedback.ts);
+                    agg.ts = max(agg.ts, hs_feedback.ts);
+                }
+                let reply = standby_feedback.reply;
+                if reply.write_lsn != Lsn::INVALID {
+                    if reply_agg.write_lsn != Lsn::INVALID {
+                        reply_agg.write_lsn = Lsn::min(reply_agg.write_lsn, reply.write_lsn);
+                    } else {
+                        reply_agg.write_lsn = reply.write_lsn;
+                    }
+                }
+                if reply.flush_lsn != Lsn::INVALID {
+                    if reply_agg.flush_lsn != Lsn::INVALID {
+                        reply_agg.flush_lsn = Lsn::min(reply_agg.flush_lsn, reply.flush_lsn);
+                    } else {
+                        reply_agg.flush_lsn = reply.flush_lsn;
+                    }
+                }
+                if reply.apply_lsn != Lsn::INVALID {
+                    if reply_agg.apply_lsn != Lsn::INVALID {
+                        reply_agg.apply_lsn = Lsn::min(reply_agg.apply_lsn, reply.apply_lsn);
+                    } else {
+                        reply_agg.apply_lsn = reply.apply_lsn;
+                    }
+                }
+                if reply.reply_ts != 0 {
+                    if reply_agg.reply_ts != 0 {
+                        reply_agg.reply_ts = TimestampTz::min(reply_agg.reply_ts, reply.reply_ts);
+                    } else {
+                        reply_agg.reply_ts = reply.reply_ts;
+                    }
                }
            }
        }
-        self.agg_hs_feedback = agg;
+        self.agg_standby_feedback = StandbyFeedback {
+            reply: reply_agg,
+            hs_feedback: agg,
+        };
    }
 }

@@ -340,12 +386,16 @@ impl SafekeeperPostgresHandler {
        start_pos: Lsn,
        term: Option<Term>,
    ) -> Result<(), QueryError> {
+        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
        if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term)
+            .handle_start_replication_guts(pgb, start_pos, term, tli.clone())
            .await
        {
+            let info = tli.get_safekeeper_info(&self.conf).await;
            // Log the result and probably send it to the client, closing the stream.
-            pgb.handle_copy_stream_end(end).await;
+            pgb.handle_copy_stream_end(end)
+            .instrument(info_span!("", term=%info.term, last_log_term=%info.last_log_term, flush_lsn=%Lsn(info.flush_lsn), commit_lsn=%Lsn(info.commit_lsn)))
+            .await;
        }
        Ok(())
    }
@@ -355,10 +405,9 @@ impl SafekeeperPostgresHandler {
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
        term: Option<Term>,
+        tli: Arc<Timeline>,
    ) -> Result<(), CopyStreamHandlerEnd> {
        let appname = self.appname.clone();
-        let tli =
-            GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;

        // Use a guard object to remove our entry from the timeline when we are done.
        let ws_guard = Arc::new(tli.get_walsenders().register(
@@ -707,8 +756,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
        match msg.first().cloned() {
            Some(HOT_STANDBY_FEEDBACK_TAG_BYTE) => {
                // Note: deserializing is on m[1..] because we skip the tag byte.
-                let hs_feedback = HotStandbyFeedback::des(&msg[1..])
+                let mut hs_feedback = HotStandbyFeedback::des(&msg[1..])
                    .context("failed to deserialize HotStandbyFeedback")?;
+                // TODO: xmin/catalog_xmin are serialized by walreceiver.c in this way:
+                // pq_sendint32(&reply_message, xmin);
+                // pq_sendint32(&reply_message, xmin_epoch);
+                // So it is two big-endian 32-bit words, with the low word sent first!
+                hs_feedback.xmin = (hs_feedback.xmin >> 32) | (hs_feedback.xmin << 32);
+                hs_feedback.catalog_xmin =
+                    (hs_feedback.catalog_xmin >> 32) | (hs_feedback.catalog_xmin << 32);
                self.ws_guard
                    .walsenders
                    .record_hs_feedback(self.ws_guard.id, &hs_feedback);
@@ -790,8 +846,11 @@ mod tests {
    fn test_hs_feedback_no_valid() {
        let mut wss = WalSendersShared::new();
        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, INVALID_FULL_TRANSACTION_ID);
+        wss.update_reply_feedback();
+        assert_eq!(
+            wss.agg_standby_feedback.hs_feedback.xmin,
+            INVALID_FULL_TRANSACTION_ID
+        );
    }

    #[test]
@@ -800,7 +859,7 @@ mod tests {
        push_feedback(&mut wss, hs_feedback(1, INVALID_FULL_TRANSACTION_ID));
        push_feedback(&mut wss, hs_feedback(1, 42));
        push_feedback(&mut wss, hs_feedback(1, 64));
-        wss.update_hs_feedback();
-        assert_eq!(wss.agg_hs_feedback.xmin, 42);
+        wss.update_reply_feedback();
+        assert_eq!(wss.agg_standby_feedback.hs_feedback.xmin, 42);
    }
 }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -248,6 +248,7 @@ impl SharedState {
        &self,
        ttid: &TenantTimelineId,
        conf: &SafeKeeperConf,
+        standby_apply_lsn: Lsn,
    ) -> SafekeeperTimelineInfo {
        SafekeeperTimelineInfo {
            safekeeper_id: conf.my_id.0,
@@ -270,6 +271,7 @@ impl SharedState {
            backup_lsn: self.sk.state.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
            availability_zone: conf.availability_zone.clone(),
+            standby_horizon: standby_apply_lsn.0,
        }
    }

@@ -663,7 +665,7 @@ impl Timeline {

            // if this is AppendResponse, fill in proper hot standby feedback.
            if let Some(AcceptorProposerMessage::AppendResponse(ref mut resp)) = rmsg {
-                resp.hs_feedback = self.walsenders.get_hotstandby();
+                resp.hs_feedback = self.walsenders.get_hotstandby().hs_feedback;
            }

            commit_lsn = shared_state.sk.state.inmem.commit_lsn;
@@ -716,7 +718,8 @@ impl Timeline {
    /// Get safekeeper info for broadcasting to broker and other peers.
    pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
        let shared_state = self.write_shared_state().await;
-        shared_state.get_safekeeper_info(&self.ttid, conf)
+        let standby_apply_lsn = self.walsenders.get_hotstandby().reply.apply_lsn;
+        shared_state.get_safekeeper_info(&self.ttid, conf, standby_apply_lsn)
    }

    /// Update timeline state with peer safekeeper data.
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -147,6 +147,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                local_start_lsn: 0,
                availability_zone: None,
+                standby_horizon: 0,
            };
            counter += 1;
            yield info;
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -42,6 +42,7 @@ message SafekeeperTimelineInfo {
    uint64 remote_consistent_lsn = 7;
    uint64 peer_horizon_lsn = 8;
    uint64 local_start_lsn = 9;
+    uint64 standby_horizon = 14;
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
    // HTTP endpoint connection string
@@ -105,4 +106,6 @@ message SafekeeperDiscoveryResponse {
    string safekeeper_connstr = 4;
    // Availability zone of a safekeeper.
    optional string availability_zone = 5;
+    // Replica apply LSN
+    uint64 standby_horizon = 6;
 }
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -736,6 +736,7 @@ mod tests {
            http_connstr: "neon-1-sk-1.local:7677".to_owned(),
            local_start_lsn: 0,
            availability_zone: None,
+            standby_horizon: 0,
        })
    }

--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -142,6 +142,7 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_resident_physical_size",
    "pageserver_io_operations_bytes_total",
    "pageserver_last_record_lsn",
+    "pageserver_standby_horizon",
    "pageserver_smgr_query_seconds_bucket",
    "pageserver_smgr_query_seconds_count",
    "pageserver_smgr_query_seconds_sum",
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2721,7 +2721,12 @@ class PgBin:
        env.update(env_add)
        return env

-    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
+    def run(
+        self,
+        command: List[str],
+        env: Optional[Env] = None,
+        cwd: Optional[Union[str, Path]] = None,
+    ):
        """
        Run one of the postgres binaries.

@@ -4145,7 +4150,12 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:


 # pg is the existing and running compute node, that we want to compare with a basebackup
-def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
+def check_restored_datadir_content(
+    test_output_dir: Path,
+    env: NeonEnv,
+    endpoint: Endpoint,
+    ignored_files: Optional[list[str]] = None,
+):
    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)

    # Get the timeline ID. We need it for the 'basebackup' command
@@ -4198,6 +4208,10 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
            if not f.startswith("pg_xact") and not f.startswith("pg_multixact")
        ]

+    if ignored_files:
+        pgdata_files = [f for f in pgdata_files if f not in ignored_files]
+        restored_files = [f for f in restored_files if f not in ignored_files]
+
    # check that file sets are equal
    assert pgdata_files == restored_files

@@ -4288,6 +4302,17 @@ def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint):
        time.sleep(1)


+def log_replica_lag(primary: Endpoint, secondary: Endpoint):
+    last_replay_lsn = Lsn(
+        secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()", log_query=False)
+    )
+    primary_lsn = Lsn(
+        primary.safe_psql_scalar("SELECT pg_current_wal_flush_lsn()", log_query=False)
+    )
+    lag = primary_lsn - last_replay_lsn
+    log.info(f"primary_lsn={primary_lsn}, replay_lsn={last_replay_lsn}, lag={lag}")
+
+
 def wait_for_last_flush_lsn(
    env: NeonEnv,
    endpoint: Endpoint,
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,9 +1,20 @@
 import os
 import re
+import threading
 import time
+from functools import partial

+import pytest
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, tenant_get_shards, wait_replica_caughtup
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    log_replica_lag,
+    tenant_get_shards,
+    wait_replica_caughtup,
+)
+from fixtures.utils import wait_until


 # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -104,19 +115,28 @@ def test_2_replicas_start(neon_simple_env: NeonEnv):
                wait_replica_caughtup(primary, secondary2)


-# We had an issue that a standby server made GetPage requests with an
-# old LSN, based on the last-written LSN cache, to avoid waits in the
-# pageserver.  However, requesting a page with a very old LSN, such
-# that the GC horizon has already advanced past it, results in an
-# error from the pageserver:
-# "Bad request: tried to request a page version that was garbage collected"
+# Test two different scenarios related to gc of data needed by hot standby.
 #
-# To avoid that, the compute<-> pageserver protocol was updated so
-# that that the standby now sends two LSNs, the old last-written LSN
-# and the current replay LSN.
+# When pause_apply is False, standby is mostly caught up with the primary.
+# However, in compute <-> pageserver protocol version 1 only one LSN had been
+# sent to the pageserver in page request, and to avoid waits in the pageserver
+# it was last-written LSN cache value. If page hasn't been updated for a long
+# time that resulted in an error from the pageserver: "Bad request: tried to
+# request a page version that was garbage collected". For primary this wasn't a
+# problem because pageserver always bumped LSN to the newest one; for standby
+# that would be incorrect since we might get page fresher than apply LSN. Hence,
+# in protocol version v2 two LSNs were introduced: main request_lsn (apply LSN
+# in case of standby) and not_modified_since which could be used as an
+# optimization to avoid waiting.
 #
 # https://github.com/neondatabase/neon/issues/6211
-def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
+#
+# When pause_apply is True we model standby lagging behind primary (e.g. due to
+# high max_standby_streaming_delay). To prevent pageserver from removing data
+# still needed by the standby apply LSN is propagated in standby -> safekeepers
+# -> broker -> pageserver flow so that pageserver could hold off gc for it.
+@pytest.mark.parametrize("pause_apply", [False, True])
+def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
    tenant_conf = {
        # set PITR interval to be small, so we can do GC
        "pitr_interval": "0 s",
@@ -160,6 +180,9 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
            # so we still remember the LSNs of the pages.
            s_cur.execute("SELECT clear_buffer_cache()")

+            if pause_apply:
+                s_cur.execute("SELECT pg_wal_replay_pause()")
+
            # Do other stuff on the primary, to advance the WAL
            p_cur.execute("CREATE TABLE test2 AS SELECT generate_series(1, 1000000) AS g")

@@ -176,6 +199,96 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder):
            # generates use old not_modified_since LSNs, older than
            # the GC cutoff, but new request LSNs. (In protocol
            # version 1 there was only one LSN, and this failed.)
+            log_replica_lag(primary, secondary)
            s_cur.execute("SELECT COUNT(*) FROM test")
+            log_replica_lag(primary, secondary)
            res = s_cur.fetchone()
            assert res[0] == 10000
+
+
+def run_pgbench(connstr: str, pg_bin: PgBin):
+    log.info(f"Start a pgbench workload on pg {connstr}")
+    # s10 is about 150MB of data. In debug mode init takes about 15s on SSD.
+    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
+    log.info("pgbench init done")
+    pg_bin.run_capture(["pgbench", "-T60", connstr])
+
+
+# assert that pgbench_accounts and its index are created.
+def pgbench_accounts_initialized(ep):
+    ep.safe_psql_scalar("select 'pgbench_accounts_pkey'::regclass")
+
+
+# Test that hot_standby_feedback works in neon (it is forwarded through
+# safekeepers). That is, ensure queries on standby don't fail during load on
+# primary under the following conditions:
+# - pgbench bombards primary with updates.
+# - On the secondary we run long select of the updated table.
+# - Set small max_standby_streaming_delay: hs feedback should prevent conflicts
+#   so apply doesn't need to wait.
+# - Do aggressive vacuum on primary which still shouldn't create conflicts.
+#   Actually this appears to be redundant due to microvacuum existence.
+#
+# Without hs feedback enabled we'd see 'User query might have needed to see row
+# versions that must be removed.' errors.
+def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    env = neon_env_builder.init_start()
+    agressive_vacuum_conf = [
+        "log_autovacuum_min_duration = 0",
+        "autovacuum_naptime = 10s",
+        "autovacuum_vacuum_threshold = 25",
+        "autovacuum_vacuum_scale_factor = 0.1",
+        "autovacuum_vacuum_cost_delay = -1",
+    ]
+    with env.endpoints.create_start(
+        branch_name="main", endpoint_id="primary", config_lines=agressive_vacuum_conf
+    ) as primary:
+        # It would be great to have more strict max_standby_streaming_delay=0s here, but then sometimes it fails with
+        # 'User was holding shared buffer pin for too long.'.
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=[
+                "max_standby_streaming_delay=2s",
+                "neon.protocol_version=2",
+                "hot_standby_feedback=true",
+            ],
+        ) as secondary:
+            log.info(
+                f"primary connstr is {primary.connstr()}, secondary connstr {secondary.connstr()}"
+            )
+            t = threading.Thread(target=run_pgbench, args=(primary.connstr(), pg_bin))
+            t.start()
+            # Wait until pgbench_accounts is created + filled on replica *and*
+            # index is created. Otherwise index creation would conflict with
+            # read queries and hs feedback won't save us.
+            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+
+            # Test should fail if hs feedback is disabled anyway, but cross
+            # check that walproposer sets some xmin.
+            def xmin_is_not_null():
+                slot_xmin = primary.safe_psql_scalar(
+                    "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                    log_query=False,
+                )
+                log.info(f"xmin is {slot_xmin}")
+                assert int(slot_xmin) > 0
+
+            wait_until(10, 1.0, xmin_is_not_null)
+            for _ in range(1, 5):
+                # in debug mode takes about 5-7s
+                balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
+                log.info(f"balance={balance}")
+                log_replica_lag(primary, secondary)
+            t.join()
+
+        # check xmin is reset when standby is gone
+        def xmin_is_null():
+            slot_xmin = primary.safe_psql_scalar(
+                "select xmin from pg_replication_slots where slot_name = 'wal_proposer_slot'",
+                log_query=False,
+            )
+            log.info(f"xmin is {slot_xmin}")
+            assert slot_xmin is None
+
+        wait_until(10, 1.0, xmin_is_null)
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -575,7 +575,10 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
    tenant_timelines = {}

    # This mirrors a constant in `downloader.rs`
-    freshen_interval_secs = 60
+    default_download_period_secs = 60
+
+    # The upload period, which will also be the download period once the secondary has seen its first heatmap
+    upload_period_secs = 20

    for _i in range(0, tenant_count):
        tenant_id = TenantId.generate()
@@ -587,7 +590,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
            placement_policy='{"Attached":1}',
            # Run with a low heatmap period so that we can avoid having to do synthetic API calls
            # to trigger the upload promptly.
-            conf={"heatmap_period": "1s"},
+            conf={"heatmap_period": f"{upload_period_secs}s"},
        )
        env.neon_cli.create_timeline("main2", tenant_id, timeline_b)

@@ -597,7 +600,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):

    # Wait long enough that the background downloads should happen; we expect all the inital layers
    # of all the initial timelines to show up on the secondary location of each tenant.
-    time.sleep(freshen_interval_secs * 1.5)
+    time.sleep(default_download_period_secs * 1.5)

    for tenant_id, timelines in tenant_timelines.items():
        attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -613,8 +616,8 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        # Delete the second timeline: this should be reflected later on the secondary
        env.storage_controller.pageserver_api().timeline_delete(tenant_id, timelines[1])

-    # Wait long enough for the secondary locations to see the deletion
-    time.sleep(freshen_interval_secs * 1.5)
+    # Wait long enough for the secondary locations to see the deletion: 2x period plus a grace factor
+    time.sleep(upload_period_secs * 2.5)

    for tenant_id, timelines in tenant_timelines.items():
        attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
@@ -626,6 +629,9 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
        assert ps_secondary.list_layers(tenant_id, timelines[0])

        # This one was deleted
+        log.info(
+            f"Checking for secondary timeline deletion {tenant_id}/{timeline_id} on node {ps_secondary.id}"
+        )
        assert not ps_secondary.list_layers(tenant_id, timelines[1])

    t_end = time.time()
@@ -640,7 +646,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):

    download_rate = (total_heatmap_downloads / tenant_count) / (t_end - t_start)

-    expect_download_rate = 1.0 / freshen_interval_secs
+    expect_download_rate = 1.0 / upload_period_secs
    log.info(f"Download rate: {download_rate * 60}/min vs expected {expect_download_rate * 60}/min")

    assert download_rate < expect_download_rate * 2
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -1,16 +1,25 @@
 #
 # This file runs pg_regress-based tests.
 #
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, cast

 import pytest
 from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    check_restored_datadir_content,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage

+if TYPE_CHECKING:
+    from typing import Optional
+
+    from fixtures.neon_fixtures import PgBin
+    from pytest import CaptureFixture
+

 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
@@ -19,12 +28,14 @@ def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    build_type: str,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
    """
    :param shard_count: if None, create an unsharded tenant.  Otherwise create a tenant with this
                        many shards.
@@ -42,7 +53,7 @@ def test_pg_regress(

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
@@ -77,7 +88,67 @@ def test_pg_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

-        check_restored_datadir_content(test_output_dir, env, endpoint)
+        ignored_files: Optional[list[str]] = None
+
+        # Neon handles unlogged relations in a special manner. During a
+        # basebackup, we ship the init fork as the main fork. This presents a
+        # problem in that the endpoint's data directory and the basebackup will
+        # have differences and will fail the eventual file comparison.
+        #
+        # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+        # support for setting the persistence of a table in 9.5. The reason that
+        # this doesn't affect versions < 15 (but probably would between 9.1 and
+        # 9.5) is that all the regression tests that deal with unlogged tables
+        # up until that point dropped the unlogged tables or set them to logged
+        # at some point during the test.
+        #
+        # In version 15, Postgres grew support for unlogged sequences, and with
+        # that came a few more regression tests. These tests did not all drop
+        # the unlogged tables/sequences prior to finishing.
+        #
+        # But unlogged sequences came with a bug in that, sequences didn't
+        # inherit the persistence of their "parent" tables if they had one. This
+        # was fixed and backported to 15, thus exacerbating our problem a bit.
+        #
+        # So what we can do is just ignore file differences between the data
+        # directory and basebackup for unlogged relations.
+        results = cast(
+            "list[tuple[str, str]]",
+            endpoint.safe_psql(
+                """
+            SELECT
+                relkind,
+                pg_relation_filepath(
+                    pg_filenode_relation(reltablespace, relfilenode)
+                ) AS unlogged_relation_paths
+            FROM pg_class
+            WHERE relpersistence = 'u'
+            """,
+                dbname=DBNAME,
+            ),
+        )
+
+        unlogged_relation_files: list[str] = []
+        for r in results:
+            unlogged_relation_files.append(r[1])
+            # This is related to the following Postgres commit:
+            #
+            # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+            # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+            # Date:   2023-08-23 09:21:31 -0500
+            #
+            # Use the buffer cache when initializing an unlogged index.
+            #
+            # This patch was backpatched to 16. Without it, the LSN in the
+            # page header would be 0/0 in the data directory, which wouldn't
+            # match the LSN generated during the basebackup, thus creating
+            # a difference.
+            if env.pg_version <= PgVersion.V15 and r[0] == "i":
+                unlogged_relation_files.append(f"{r[1]}_init")
+
+        ignored_files = unlogged_relation_files
+
+        check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)


 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -86,8 +157,8 @@ def test_pg_regress(
 def test_isolation(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
@@ -142,8 +213,8 @@ def test_isolation(
 def test_sql_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
-    pg_bin,
-    capsys,
+    pg_bin: PgBin,
+    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
--- a/test_runner/regress/test_unlogged.py
+++ b/test_runner/regress/test_unlogged.py
@@ -1,4 +1,5 @@
 from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
+from fixtures.pg_version import PgVersion


 #
@@ -17,7 +18,8 @@ def test_unlogged(neon_simple_env: NeonEnv):
    cur.execute("CREATE UNLOGGED TABLE iut (id int);")
    # create index to test unlogged index relation as well
    cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
-    cur.execute("INSERT INTO iut values (42);")
+    cur.execute("ALTER TABLE iut ADD COLUMN seq int GENERATED ALWAYS AS IDENTITY;")
+    cur.execute("INSERT INTO iut (id) values (42);")

    # create another compute to fetch inital empty contents from pageserver
    fork_at_current_lsn(env, endpoint, "test_unlogged_basebackup", "test_unlogged")
@@ -26,7 +28,15 @@ def test_unlogged(neon_simple_env: NeonEnv):
    conn2 = endpoint2.connect()
    cur2 = conn2.cursor()
    # after restart table should be empty but valid
-    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
+    cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut (id) VALUES ($1)")
    cur2.execute("EXECUTE iut_plan (43);")
    cur2.execute("SELECT * FROM iut")
-    assert cur2.fetchall() == [(43,)]
+    results = cur2.fetchall()
+    # Unlogged sequences were introduced in v15. On <= v14, the sequence created
+    # for the GENERATED ALWAYS AS IDENTITY column is logged, and hence it keeps
+    # the old value (2) on restart. While on v15 and above, it's unlogged, so it
+    # gets reset to 1.
+    if env.pg_version <= PgVersion.V14:
+        assert results == [(43, 2)]
+    else:
+        assert results == [(43, 1)]
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,5 +1,5 @@
 {
-  "v16": ["16.2", "1f63dd206a8aaa4727baad334c548219c52878e1"],
-  "v15": ["15.6", "6fd679f5154d12f4892ddd450cc6be28a8ac31b0"],
-  "v14": ["14.11", "a6dc3f010da31472a7ae9ab0ddfbf6e49131d93c"]
+  "v16": ["16.3", "c271017c6c4846be59948766baec2ba4ace5dc9c"],
+  "v15": ["15.7", "e2dbd63345c584de75173c27951f111249ae0016"],
+  "v14": ["14.12", "21ec61d539d22a81fe811c2d79e26436820bc3f4"]
 }
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -254,8 +254,8 @@ files:
          select
            case
              when pg_catalog.pg_is_in_recovery()
-              then pg_last_wal_replay_lsn()
-              else pg_current_wal_lsn()
+              then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
+              else (pg_current_wal_lsn() - '0/0')::FLOAT8
            end as lsn;

      - metric_name: replication_delay_bytes
@@ -294,6 +294,9 @@ files:
        query: |
          SELECT checkpoints_timed FROM pg_stat_bgwriter;

+      # In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
+      # It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
+
      # Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
      - metric_name: logical_slot_restart_lsn
        type: gauge
@@ -302,7 +305,32 @@ files:
          - slot_name
        values: [restart_lsn]
        query: |
-          select slot_name, restart_lsn from pg_replication_slots where slot_type = 'logical';
+          select slot_name, (restart_lsn - '0/0')::FLOAT8 from pg_replication_slots where slot_type = 'logical';
+
+      - metric_name: retained_wal
+        type: gauge
+        help: 'Retained WAL in inactive replication slots'
+        key_labels:
+          - slot_name
+        values: [retained_wal]
+        query: |
+          SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
+          FROM pg_replication_slots
+          WHERE active = false;
+
+      - metric_name: wal_is_lost
+        type: gauge
+        help: 'Whether or not the replication slot''s wal_status is lost'
+        key_labels:
+          - slot_name
+        values: [wal_status_is_lost]
+        query: |
+          SELECT slot_name,
+          CASE
+            WHEN wal_status = 'lost' THEN 1
+            ELSE 0
+          END AS wal_status_is_lost
+          FROM pg_replication_slots;
  - filename: neon_collector_autoscaling.yml
    content: |
      collector_name: neon_collector_autoscaling
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -51,7 +51,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -102,7 +102,7 @@ num-bigint = { version = "0.4" }
 num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
-parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
Author	SHA1	Message	Date
Alex Chi Z	496e96cdff	Update pageserver/src/tenant.rs Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2024-05-21 13:32:57 -04:00
Alex Chi Z	63db18101e	feat(pageserver): remove aux v1 keyspace if user fully switches to v2 Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-21 11:21:27 -04:00
John Spray	353afe4fe7	neon_local: run controller's postgres with fsync=off (#7817 ) ## Problem In `test_storage_controller_many_tenants` we [occasionally](https://neon-github-public-dev.s3.amazonaws.com/reports/main/9155810417/index.html#/testresult/8fbdf57a0e859c2d) see it hit the retry limit on serializable transactions. That's likely due to a combination of relative slow fsync on the hetzner nodes running the test, and the way the test does lots of parallel timeline creations, putting high load on the drive. Running the storage controller's db with fsync=off may help here. ## Summary of changes - Set `fsync=off` in the postgres config for the database used by the storage controller in tests	2024-05-21 18:13:54 +03:00
Tristan Partin	1988ad8db7	Extend test_unlogged to include a sequence Unlogged sequences were added in v15, so let's just test to make sure they work on Neon.	2024-05-21 09:18:11 -05:00
Tristan Partin	e3415706b7	Upgrade Postgres v16 to 16.3	2024-05-21 09:18:11 -05:00
Tristan Partin	9d081851ec	Upgrade Postgres v15 to 15.7	2024-05-21 09:18:11 -05:00
Tristan Partin	781352bd8e	Upgrade Postgres v14 to 14.12	2024-05-21 09:18:11 -05:00
Tristan Partin	8030b8e4c5	Fix test_pg_regress for unlogged relations Previously we worked around file comparison issues by dropping unlogged relations in the pg_regress tests, but this would lead to an unnecessary diff when compared to upstream in our Postgres fork. Instead, we can precompute the files that we know will be different, and ignore them.	2024-05-21 09:18:11 -05:00
Tristan Partin	9a4b896636	Use a constant for database name in test_pg_regress	2024-05-21 09:18:11 -05:00
Tristan Partin	e8b8ebfa1d	Allow check_restored_datadir_content to ignore certain files Some files may have known differences that we are okay with.	2024-05-21 09:18:11 -05:00
Tristan Partin	d9d471e3c4	Add some Python typing in a few test files	2024-05-21 09:18:11 -05:00
Arseny Sher	d43dcceef9	Minimize hot standby feedback xmins to next_xid. Hot standby feedback xmins can be greater than next_xid due to sparse update of nextXid on pageserver (to do less writes it advances next xid on 1024). ProcessStandbyHSFeedback ignores such xids from the future; to fix, minimize received xmin to next_xid. Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-05-21 16:21:29 +03:00
Arseny Sher	f2771a99b7	Add metric for pageserver standby horizon. Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-05-21 16:21:29 +03:00
Arseny Sher	f54c3b96e0	Fix bugs in hot standby feedback propagation and add test for it. Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-05-21 16:21:29 +03:00
Arseny Sher	478cc37a70	Propagate standby apply LSN to pageserver to hold off GC. To avoid pageserver gc'ing data needed by standby, propagate standby apply LSN through standby -> safekeeper -> broker -> pageserver flow and hold off GC for it. Iteration of GC resets the value to remove the horizon when standby goes away -- pushes are assumed to happen at least once between gc iterations. As a safety guard max allowed lag compared to normal GC horizon is hardcoded as 10GB. Add test for the feature. Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2024-05-21 16:21:29 +03:00
John Spray	4ce6e2d2fc	pageserver: fix secondary progress stats when layers are 404 (#7814 ) ## Problem Noticed this issue in staging. When a tenant is under somewhat heavy timeline creation/deletion thrashing, it becomes quite common for secondary downloads to encounter 404s downloading layers. This is tolerated by design, because heatmaps are not guaranteed to be up to date with what layers/timelines actually exist. However, we were not updating the SecondaryProgress structure in this case, so after such a download pass, we would leave a SecondaryProgress state with lower "downloaded" stats than "total" stats. This causes the storage controller to consider this secondary location inelegible for optimization actions such as we do after shard splits This issue has relative low impact because a typical tenant will eventually upload a heatmap where we do download all the layers and thereby enable the controller to progress with migrations -- the heavy thrashing of timeline creation/deletion is an artifact of our nightly stress tests. ## Summary of changes - In the layer 404 case, subtract the skipped layer's stats from the totals, so that at the end of this download pass we should still end up in a complete state. - When updating `last_downloaded`, do a sanity check that our progress is complete. In debug builds, assert out if this is not the case. In prod builds, correct the stats and log a warning.	2024-05-21 13:46:04 +01:00
dependabot[bot]	baeb58432f	build(deps): bump requests from 2.31.0 to 2.32.0 (#7816 )	2024-05-21 10:48:17 +00:00
Sasha Krassovsky	6f3e043a76	Add some more replication slot metrics (#7761 ) ## Problem We want to add alerts for when people's replication slots break, and also metrics for retained WAL so that we can make warn customers when their storage gets bloated. ## Summary of changes Adds the metrics. Addresses https://github.com/neondatabase/neon/issues/7593	2024-05-21 00:00:47 +00:00
Alex Chi Z	6810d2aa53	feat(pageserver): do not read past image layers for vectored get (#7773 ) ## Problem Part of https://github.com/neondatabase/neon/issues/7462 On metadata keyspace, vectored get will not stop if a key is not found, and will read past the image layer. However, the semantics is different from single get, because if a key does not exist in the image layer, it means that the key does not exist in the past, or have been deleted. This pull request fixed it by recording image layer coverage during the vectored get process and stop when the full keyspace is covered by an image layer. A corresponding test case is added to ensure generating image layer reduces the number of delta layers. This optimization (or bug fix) also applies to rel block keyspaces. If a key is missing, we can know it's missing once the first image layer is reached. Page server will not attempt to read lower layers, which potentially incurs layer downloads + evictions. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-20 14:24:18 -04:00
Andy Hattemer	2d7091871f	Update banner image in Readme (#7801 ) Update the readme banner with updated branding.	2024-05-20 12:15:43 -04:00
Alex Chi Z	7701ca45dd	feat(pageserver): generate image layers for sparse keyspace (#7567 ) Part of https://github.com/neondatabase/neon/issues/7462 Sparse keyspace does not generate image layers for now. This pull request adds support for generating image layers for sparse keyspace. ## Summary of changes * Use the scan interface to generate compaction data for sparse keyspace. * Track num of delta layers reads during scan. * Read-trigger compaction: when a scan on the keyspace touches too many delta files, generate an image layer. There are one hard-coded threshold for now: max delta layers we want to touch for a scan. * L0 compaction does not need to compute holes for metadata keyspace. Know issue: the scan interface currently reads past the image layer, which causes `delta_layer_accessed` keeps increasing even if image layers are generated. The pull request to fix that will be separate, and orthogonal to this one. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-05-20 16:08:45 +00:00
Arseny Sher	de8dfee4bd	safekeeper: log LSNs on walreceiver/walsender exit. Useful for observability.	2024-05-20 15:43:10 +03:00
Arseny Sher	e3f51abadf	safekeeper: close connection when COPY stream ends. We can't gracefully exit COPY mode (and don't need that), so close connection to prevent further attempts to use it.	2024-05-20 15:43:10 +03:00
Peter Bendel	a7b84cca5a	Upgrade of pgvector to 0.7.0 (#7726 ) Upgrade pgvector to 0.7.0. This PR is based on Heikki's PR #6753 and just uses pgvector 0.7.0 instead of 0.6.0 I have now done all planned manual tests. The pull request is ready to be reviewed and merged and can be deployed in production together / after swap enablement. See (https://github.com/neondatabase/autoscaling/issues/800) Fixes https://github.com/neondatabase/neon/issues/6516 Fixes https://github.com/neondatabase/neon/issues/7780 ## Documentation input for usage recommendations ### maintenance_work_mem In Neon `maintenance_work_mem` is very small by default (depends on configured RAM for your compute but can be as low as 64 MB). To optimize pgvector index build time you may have to bump it up according to your working set size (size of tuples for vector index creation). You can do so in the current session using `SET maintenance_work_mem='10 GB';` The target value you choose should fit into the memory of your compute size and not exceed 50-60% of available RAM. The value above has been successfully used on a 7CU endpoint. ### max_parallel_maintenance_workers max_parallel_maintenance_workers is also small by default (2). For efficient parallel pgvector index creation you have to bump it up with `SET max_parallel_maintenance_workers = 7` to make use of all the CPUs available, assuming you have configured your endpoint to use 7CU. ## ID input for changelog pgvector extension in Neon has been upgraded from version 0.5.1 to version 0.7.0. 
Please see https://github.com/pgvector/pgvector/ for documentation of new capabilities in pgvector version 0.7.0 If you have existing databases with pgvector 0.5.1 already installed there is a slight difference in behavior in the following corner cases even if you don't run `ALTER EXTENSION UPDATE`: ### L2 distance from NULL::vector For the following script, comparing the NULL::vector to non-null vectors the resulting output changes: ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING hnsw (val vector_l2_ops); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` and now the output is ``` val --------- [1,1,1] [1,2,4] [1,2,3] [0,0,0] (4 rows) ``` For the following script ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING ivfflat (val vector_l2_ops) WITH (lists = 1); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` the output now is ``` val --------- [0,0,0] [1,2,3] [1,1,1] [1,2,4] (4 rows) ``` ### changed error messages If you provide invalid literals for datatype vector you may get improved/changed error messages, for example: ```sql neondb=> SELECT '[4e38,1]'::vector; ERROR: "4e38" is out of range for type vector LINE 1: SELECT '[4e38,1]'::vector; ^ ``` --------- Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2024-05-20 12:07:25 +02:00
John Spray	291fcb9e4f	pageserver: use the heatmap upload interval to set the secondary download interval (#7793 ) ## Problem The heatmap upload period is configurable, but secondary mode downloads were using a fixed download period. Closes: #6200 ## Summary of changes - Use the upload period in the heatmap to adjust the download period. In practice, this will reduce the frequency of downloads from its current 60 second period to what heatmaps use, which is 5-10m depending on environment. This is an improvement rather than being optimal: we could be smarter about periods, and schedule downloads to occur around the time we expect the next upload, rather than just using the same period, but that's something we can address in future if it comes up.	2024-05-20 09:25:25 +01:00
Conrad Ludgate	a5ecca976e	proxy: bump parquet (#7782 ) ## Summary of changes Updates the parquet lib. one change left that we need is in an open PR against upstream, hopefully we can remove the git dependency by 52.0.0 https://github.com/apache/arrow-rs/pull/5773 I'm not sure why the parquet files got a little bit bigger. I tested them and they still open fine. 🤷 side effect of the update, chrono updated and added yet another deprecation warning (hence why the safekeepers change)	2024-05-19 19:45:53 +00:00
Heikki Linnakangas	5caee4ca54	Fix calculation in test The comment says that this checks if there's enough space on the page for logical message and an XLOG_SWITCH. So the sizes of the logical message and the XLOG_SWITCH record should be added together, not subtracted. I saw a panic in the test that led me to investigate and notice this (https://neon-github-public-dev.s3.amazonaws.com/reports/pr-7803/9142396223/index.html): RuntimeError: Run ['/tmp/neon/bin/wal_craft', 'in-existing', 'last_wal_record_xlog_switch_ends_on_page_boundary', "host=localhost port=16165 user=cloud_admin dbname=postgres options='-cstatement_timeout=120s '"] failed: stdout: stderr: thread 'main' panicked at libs/postgres_ffi/wal_craft/src/lib.rs:370:27: attempt to subtract with overflow stack backtrace: 0: rust_begin_unwind at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/std/src/panicking.rs:645:5 1: core::panicking::panic_fmt at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:72:14 2: core::panicking::panic at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/panicking.rs:145:5 3: <wal_craft::LastWalRecordXlogSwitchEndsOnPageBoundary as wal_craft::Crafter>::craft::<postgres::client::Client> at libs/postgres_ffi/wal_craft/src/lib.rs:370:27 4: wal_craft::main::{closure#0} at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:21:17 5: wal_craft::main at libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs:66:47 6: <fn() -> core::result::Result<(), anyhow::Error> as core::ops::function::FnOnce<()>>::call_once at /rustc/9b00956e56009bab2aa15d7bff10916599e3d6d6/library/core/src/ops/function.rs:250:5 note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.	2024-05-19 21:49:51 +03:00