Compare commits


3 Commits

Author SHA1 Message Date
Alexander Bayandin 527727d0f5 Merge branch 'main' into vk/pg_embedding_v16_support 2023-09-12 20:09:12 +01:00
Alexander Bayandin d770e88886 Update checksum from pg_embedding source code 2023-09-12 20:08:20 +01:00
Vadim Kharitonov a34bdc6eee pg_embedding with Pg16 support 2023-09-11 13:02:52 +02:00
57 changed files with 1286 additions and 2910 deletions

View File

@@ -16,29 +16,21 @@ on:
# Actual magic happens here:
- labeled
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
permissions: write-all
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
permissions:
pull-requests: write # For `gh pr edit`
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -51,10 +43,6 @@ jobs:
create-or-update-pr-for-ci-run:
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
permissions:
pull-requests: write # for `gh pr edit`
# For `git push` and `gh pr create` we use CI_ACCESS_TOKEN
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -87,7 +75,7 @@ jobs:
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
EOF
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
if [ -z "${ALREADY_CREATED}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
--body-file "body.md" \
@@ -99,10 +87,6 @@ jobs:
cleanup:
# Close PRs and delete branches if the original PR is closed.
permissions:
contents: write # for `--delete-branch` flag in `gh pr close`
pull-requests: write # for `gh pr close`
if: |
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
@@ -110,9 +94,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
- run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
if [ "${CLOSED}" == "false" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
fi

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.10
VM_BUILDER_VERSION: v0.17.5
steps:
- name: Checkout

View File

@@ -124,21 +124,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PLV8_VERSION=3.1.5 \
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
;; \
"v16") \
export PLV8_VERSION=3.1.8 \
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
;; \
*) \
echo "Export the valid PG_VERSION variable" && exit 1 \
;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \
echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -429,7 +416,7 @@ RUN case "${PG_VERSION}" in \
;; \
"v16") \
export PG_HINT_PLAN_VERSION=16_1_6_0 \
export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \
;; \
*) \
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
@@ -579,8 +566,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
export PG_EMBEDDING_VERSION=0.3.6 \
export PG_EMBEDDING_CHECKSUM=b2e2b359335d26987778c7fae0c9bcc8ebc3530fc214113be1ddbc8a136e52ac \
;; \
*) \
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \

View File

@@ -630,27 +630,25 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]
pub fn start_postgres(&self, pspec: &ParsedSpec) -> Result<std::process::Child> {
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
) -> Result<std::process::Child> {
let pgdata_path = Path::new(&self.pgdata);
let spec = &pspec.spec;
// Run postgres as a child process.
let mut pg_command = maybe_cgexec(&self.pgbin);
pg_command.args(["-D", &self.pgdata]);
let mut pg = maybe_cgexec(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
})
.spawn()
.expect("cannot start postgres process");
if let Some(storage_auth_token) = &spec.storage_auth_token {
pg_command.env("NEON_AUTH_TOKEN", storage_auth_token);
}
if let Some(env_vars) = &spec.env_vars {
pg_command.envs(
env_vars
.iter()
.map(|(k, v)| (k, v.clone().unwrap_or(String::new()))),
);
}
let mut pg = pg_command.spawn().expect("cannot start postgres process");
wait_for_postgres(&mut pg, pgdata_path)?;
Ok(pg)
}
@@ -799,7 +797,7 @@ impl ComputeNode {
self.prepare_pgdata(&compute_state, extension_server_port)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec)?;
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
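For context on the start_postgres hunks above: a minimal, hypothetical sketch of how a compute spec's optional environment variables (the env_vars: Option<HashMap<String, Option<String>>> field that appears later in this compare) could be forwarded to the spawned postgres process with std::process::Command. The helper name spawn_with_env and its parameters are illustrative only, not part of the diff.

use std::collections::HashMap;
use std::process::{Child, Command};

// Hypothetical helper: forward the spec's optional env vars (and the storage auth
// token) to the child postgres process. `None` values become empty strings, as in
// the hunk above.
fn spawn_with_env(
    pgbin: &str,
    pgdata: &str,
    storage_auth_token: Option<&str>,
    env_vars: Option<&HashMap<String, Option<String>>>,
) -> std::io::Result<Child> {
    let mut cmd = Command::new(pgbin);
    cmd.args(["-D", pgdata]);
    if let Some(token) = storage_auth_token {
        cmd.env("NEON_AUTH_TOKEN", token);
    }
    if let Some(vars) = env_vars {
        cmd.envs(vars.iter().map(|(k, v)| (k, v.clone().unwrap_or_default())));
    }
    cmd.spawn()
}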

View File

@@ -500,7 +500,6 @@ impl Endpoint {
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions: None,
env_vars: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;

View File

@@ -58,6 +58,7 @@ pub struct ComputeSpec {
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
#[serde(default)]
pub mode: ComputeMode,
@@ -67,9 +68,6 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
// Environment variables to set on the compute
pub env_vars: Option<HashMap<String, Option<String>>>,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]

View File

@@ -243,9 +243,5 @@
"public_extensions": [
"postgis"
]
},
"env_vars": {
"OPENAI_API_KEY" : "i hope heikki has a good day today",
"MY_NULL_ENV_VAR" : null
}
}
}
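A small, hedged sketch (assuming serde with the derive feature and serde_json are available) of how the env_vars shape in the spec JSON above maps onto Option<HashMap<String, Option<String>>>: a JSON null deserializes to None for that variable. The struct name SpecEnvOnly is made up for illustration.

use std::collections::HashMap;
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct SpecEnvOnly {
    // mirrors the `env_vars` field of ComputeSpec shown above
    env_vars: Option<HashMap<String, Option<String>>>,
}

fn main() {
    let json = r#"{ "env_vars": { "SOME_KEY": "some value", "MY_NULL_ENV_VAR": null } }"#;
    let spec: SpecEnvOnly = serde_json::from_str(json).unwrap();
    let vars = spec.env_vars.unwrap();
    assert_eq!(vars["SOME_KEY"].as_deref(), Some("some value"));
    assert_eq!(vars["MY_NULL_ENV_VAR"], None);
}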

View File

@@ -3,9 +3,9 @@
//!
use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde::Serialize;
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -27,8 +27,7 @@ impl EventType {
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull, at least pageserver creates
// incremental ranges where the stop and next start are equal.
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
@@ -42,25 +41,15 @@ impl EventType {
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
/// Returns the absolute time, or for incremental ranges, the stop time.
pub fn recorded_at(&self) -> &DateTime<Utc> {
use EventType::*;
match self {
Absolute { time } => time,
Incremental { stop_time, .. } => stop_time,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra, Metric> {
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]
#[serde(rename = "type")]
pub kind: EventType,
pub metric: Metric,
pub metric: &'static str,
pub idempotency_key: String,
pub value: u64,
@@ -69,38 +58,12 @@ pub struct Event<Extra, Metric> {
}
pub fn idempotency_key(node_id: &str) -> String {
IdempotencyKey::generate(node_id).to_string()
}
/// Downstream users will use these to detect upload retries.
pub struct IdempotencyKey<'a> {
now: chrono::DateTime<Utc>,
node_id: &'a str,
nonce: u16,
}
impl std::fmt::Display for IdempotencyKey<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
}
}
impl<'a> IdempotencyKey<'a> {
pub fn generate(node_id: &'a str) -> Self {
IdempotencyKey {
now: Utc::now(),
node_id,
nonce: rand::thread_rng().gen_range(0..=9999),
}
}
pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
IdempotencyKey {
now,
node_id,
nonce,
}
}
format!(
"{}-{}-{:04}",
Utc::now(),
node_id,
rand::thread_rng().gen_range(0..=9999)
)
}
pub const CHUNK_SIZE: usize = 1000;
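A hedged usage sketch of the IdempotencyKey added above: with fixed inputs via for_tests, the Display impl yields "<timestamp>-<node id>-<zero-padded nonce>", which is the string downstream consumers compare to detect upload retries. Assumes chrono and the IdempotencyKey type are in scope; the function name is illustrative.

use chrono::{TimeZone, Utc};

fn idempotency_key_format_example() {
    let now = Utc.with_ymd_and_hms(2023, 9, 12, 20, 0, 0).unwrap();
    let key = IdempotencyKey::for_tests(now, "pageserver-1", 42);
    // "{now}-{node_id}-{nonce:04}" per the Display impl above
    assert_eq!(key.to_string(), format!("{now}-pageserver-1-0042"));
}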

View File

@@ -137,12 +137,9 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
pub const XLOG_HEAP_DELETE: u8 = 0x10;
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
pub const XLOG_HEAP_LOCK: u8 = 0x60;
pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

View File

@@ -315,8 +315,12 @@ impl CgroupWatcher {
where
E: Stream<Item = Sequenced<u64>>,
{
// There are several actions we might take when receiving a `memory.high` event,
// such as freezing the cgroup, or increasing its `memory.high`. We don't
// want to do these things too often (because postgres needs to run, and
// we only have so much memory). These timers serve as rate limits for this.
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
let mut last_memory_high_increase_at: Option<Instant> = None;
let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
let mut events = pin!(events);
// Are we waiting to be upscaled? Could be true if we request upscale due
@@ -328,8 +332,6 @@ impl CgroupWatcher {
upscale = upscales.recv() => {
let Sequenced { seqnum, data } = upscale
.context("failed to listen on upscale notification channel")?;
waiting_on_upscale = false;
last_memory_high_increase_at = None;
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
}
@@ -394,17 +396,12 @@ impl CgroupWatcher {
.send(())
.await
.context("failed to request upscale")?;
waiting_on_upscale = true;
continue;
}
// Shoot, we can't freeze and we're still waiting on upscale,
// increase memory.high to reduce throttling
let can_increase_memory_high = match last_memory_high_increase_at {
None => true,
Some(t) => t.elapsed() > self.config.memory_high_increase_every,
};
if can_increase_memory_high {
if wait_to_increase_memory_high.is_elapsed() {
info!(
"received memory.high event, \
but too soon to refreeze and already requested upscale \
@@ -440,11 +437,12 @@ impl CgroupWatcher {
);
self.set_high_bytes(new_high)
.context("failed to set memory.high")?;
last_memory_high_increase_at = Some(Instant::now());
continue;
wait_to_increase_memory_high
.as_mut()
.reset(Instant::now() + self.config.memory_high_increase_every)
}
info!("received memory.high event, but can't do anything");
// we can't do anything
}
};
}
@@ -561,7 +559,14 @@ impl CgroupWatcher {
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
pub high: u64,
high: u64,
max: u64,
}
impl MemoryLimits {
pub fn new(high: u64, max: u64) -> Self {
Self { max, high }
}
}
// Methods for manipulating the actual cgroup
@@ -638,7 +643,12 @@ impl CgroupWatcher {
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(limits.high, path = self.path(), "writing new memory limits",);
info!(
limits.high,
limits.max,
path = self.path(),
"writing new memory limits",
);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
@@ -647,7 +657,7 @@ impl CgroupWatcher {
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: None,
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
})
.context("failed to set memory limits")
}
@@ -655,7 +665,7 @@ impl CgroupWatcher {
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits { high: new_high };
let limits = MemoryLimits::new(new_high, available_memory);
info!(
path = self.path(),
memory = ?limits,
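One side of the hunks above rate-limits memory.high reactions with an Option<Instant> cooldown, the other replaces it with a pinned tokio sleep timer. A minimal standalone sketch of the Option<Instant> variant (names are illustrative) shows the idea in isolation:

use std::time::{Duration, Instant};

// Illustrative cooldown guard: act at most once per `every`, mirroring the
// `last_memory_high_increase_at` / `memory_high_increase_every` check above.
struct Cooldown {
    every: Duration,
    last_acted_at: Option<Instant>,
}

impl Cooldown {
    fn try_act(&mut self) -> bool {
        let allowed = match self.last_acted_at {
            None => true,
            Some(t) => t.elapsed() > self.every,
        };
        if allowed {
            self.last_acted_at = Some(Instant::now());
        }
        allowed
    }
}

The pinned tokio::time::sleep version on the other side of the hunk expresses the same thing as a timer that is reset() after each action and checked with is_elapsed().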

View File

@@ -257,11 +257,12 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
let limits = MemoryLimits {
let limits = MemoryLimits::new(
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be 0 here
// since it is properly initialized in the previous cgroup if let block
high: new_cgroup_mem_high,
};
new_cgroup_mem_high,
available_memory,
);
cgroup
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
@@ -327,9 +328,7 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits {
high: new_cgroup_mem_high,
};
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
cgroup
.set_limits(&limits)
.context("failed to set file cache size")?;

View File

@@ -80,11 +80,11 @@ enum-map.workspace = true
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
tempfile.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tempfile.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
[[bench]]

View File

@@ -518,9 +518,6 @@ fn start_pageserver(
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
task_mgr::spawn(
crate::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
@@ -547,7 +544,6 @@ fn start_pageserver(
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))

View File

@@ -64,7 +64,7 @@ pub mod defaults {
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

View File

@@ -1,54 +1,188 @@
//!
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
//! Cache metrics to send only the updated ones.
//!
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use consumption_metrics::EventType;
use anyhow;
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use pageserver_api::models::TenantState;
use reqwest::Url;
use serde::Serialize;
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tracing::*;
use utils::id::NodeId;
mod metrics;
use metrics::MetricsKey;
mod disk_cache;
mod upload;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// Basically a key-value pair, but usually in a Vec except for [`Cache`].
///
/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form.
/// Difference is basically the missing idempotency key, which lives only for the duration of
/// upload attempts.
type RawMetric = (MetricsKey, (EventType, u64));
#[serde_as]
#[derive(Serialize, Debug, Clone, Copy)]
struct Ids {
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
timeline_id: Option<TimelineId>,
}
/// Caches the [`RawMetric`]s
///
/// In practice, during startup the last sent values are stored here to be used in calculating new
/// ones. After a successful upload the sent values are written back to this cache. This used to be
/// used for deduplication, but that is no longer needed.
type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Key that uniquely identifies the object this metric describes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MetricsKey {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
metric: &'static str,
}
impl MetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(MetricsKey);
impl AbsoluteValueFactory {
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
let key = self.0;
(key, (EventType::Absolute { time }, val))
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(MetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
fn from_previous_up_to(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> (MetricsKey, (EventType, u64)) {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
(
key,
(
EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
},
val,
),
)
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
// the static part of a MetricsKey
impl MetricsKey {
/// Absolute value of [`Timeline::get_last_record_lsn`].
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size",
}
.absolute_values()
}
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
/// previously sent, starting from the previously sent incremental time range ending at the
/// latest absolute measurement.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
// the name here is correctly about data, not size, because that is what the
// downstream pipeline expects
metric: "written_data_bytes_delta",
}
.incremental_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "timeline_logical_size",
}
.absolute_values()
}
/// [`Tenant::remote_size`]
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "remote_storage_size",
}
.absolute_values()
}
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "resident_size",
}
.absolute_values()
}
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "synthetic_storage_size",
}
.absolute_values()
}
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: PathBuf,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
tracing::warn!(
"cached_metric_collection_interval is no longer used, please set it to zero."
)
}
let mut ticker = tokio::time::interval(metric_collection_interval);
info!("starting collect_metrics");
// spin up background worker that calculates tenant sizes
let worker_ctx =
@@ -68,216 +202,543 @@ pub async fn collect_metrics(
},
);
let path: Arc<PathBuf> = Arc::new(local_disk_storage);
let cancel = task_mgr::shutdown_token();
let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
let mut cached_metrics = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
ret = restore_and_reschedule => ret,
};
// define client here to reuse it for all requests
let client = reqwest::ClientBuilder::new()
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let node_id = node_id.to_string();
// reminder: ticker is ready immediately
let mut ticker = tokio::time::interval(metric_collection_interval);
let mut cached_metrics = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
loop {
let tick_at = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("collect_metrics received cancellation request");
return Ok(());
},
tick_at = ticker.tick() => {
// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
// send cached metrics every cached_metric_collection_interval
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
if metrics.is_empty() {
continue;
if send_cached {
prev_iteration_time = std::time::Instant::now();
}
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
);
}
}
let metrics = Arc::new(metrics);
// why not race cancellation here? because we are one of the last tasks, and if we are
// already here, better to try to flush the new values.
let flush = async {
match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
Ok(()) => {
tracing::debug!("flushed metrics to disk");
}
Err(e) => {
// idea here is that if someone creates a directory as our path, then they
// might notice it from the logs before shutdown and remove it
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
}
}
};
let upload = async {
let res = upload::upload_metrics(
&client,
metric_collection_endpoint,
&cancel,
&node_id,
&metrics,
&mut cached_metrics,
)
.await;
if let Err(e) = res {
// serialization error which should never happen
tracing::error!("failed to upload due to {e:#}");
}
};
// let these run concurrently
let (_, _) = tokio::join!(flush, upload);
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
);
}
}
/// Called on the first iteration in an attempt to join the metric uploading schedule from the
/// previous pageserver session. The pageserver is supposed to upload at intervals regardless of restarts.
/// One iteration of metrics collection
///
/// Cancellation safe.
async fn restore_and_reschedule(
path: &Arc<PathBuf>,
metric_collection_interval: Duration,
) -> Cache {
let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
Ok(found_some) => {
// there is no min needed because we write these sequentially in
// collect_all_metrics
let earlier_metric_at = found_some
.iter()
.map(|(_, (et, _))| et.recorded_at())
.copied()
.next();
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
/// Cache metrics to avoid sending the same metrics multiple times.
///
/// This function handles all errors internally
/// and doesn't break iteration if just one tenant fails.
///
/// TODO
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
ctx: &RequestContext,
send_cached: bool,
) {
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
trace!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
);
let cached = found_some.into_iter().collect::<Cache>();
(cached, earlier_metric_at)
}
Err(e) => {
use std::io::{Error, ErrorKind};
let root = e.root_cause();
let maybe_ioerr = root.downcast_ref::<Error>();
let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
if !is_not_found {
tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
}
(HashMap::new(), None)
// get list of tenants
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
error!("failed to list tenants: {:?}", err);
return;
}
};
if let Some(earlier_metric_at) = earlier_metric_at {
let earlier_metric_at: SystemTime = earlier_metric_at.into();
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
let error = reschedule(earlier_metric_at, metric_collection_interval).await;
if let Some(error) = error {
if error.as_secs() >= 60 {
tracing::info!(
error_ms = error.as_millis(),
"startup scheduling error due to restart"
)
let tenant = match mgr::get_tenant(tenant_id, true).await {
Ok(tenant) => tenant,
Err(err) => {
// It is possible that tenant was deleted between
// `list_tenants` and `get_tenant`, so just warn about it.
warn!("failed to get tenant {tenant_id:?}: {err:?}");
continue;
}
};
let mut tenant_resident_size = 0;
// iterate through list of timelines in tenant
for timeline in tenant.list_timelines() {
// collect per-timeline metrics only for active timelines
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
Ok(Some(snap)) => {
snap.to_metrics(
tenant_id,
timeline_id,
Utc::now(),
&mut current_metrics,
cached_metrics,
);
}
Ok(None) => {}
Err(e) => {
error!(
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
timeline.timeline_id
);
continue;
}
}
tenant_resident_size += timeline.resident_physical_size();
}
current_metrics
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
current_metrics
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
let synthetic_size = tenant.cached_synthetic_size();
if synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
}
}
cached
// Filter metrics, unless we want to send all metrics, including cached ones.
// See: https://github.com/neondatabase/neon/issues/3485
if !send_cached {
current_metrics.retain(|(curr_key, (kind, curr_val))| {
if kind.is_incremental() {
// incremental values (currently only written_size_delta) should not get any cache
// deduplication because they will be used by upstream for "is still alive."
true
} else {
match cached_metrics.get(curr_key) {
Some((_, val)) => val != curr_val,
None => true,
}
}
});
}
if current_metrics.is_empty() {
trace!("no new metrics to send");
return;
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
let chunks = current_metrics.chunks(CHUNK_SIZE);
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
let node_id = node_id.to_string();
for chunk in chunks {
chunk_to_send.clear();
// enrich metrics with type, timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
metric: curr_key.metric,
idempotency_key: idempotency_key(&node_id),
value: *curr_val,
extra: Ids {
tenant_id: curr_key.tenant_id,
timeline_id: curr_key.timeline_id,
},
}));
const MAX_RETRIES: u32 = 3;
for attempt in 0..MAX_RETRIES {
let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk {
events: (&chunk_to_send).into(),
})
.send()
.await;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk_to_send
.iter()
.filter(|metric| metric.value > (1u64 << 40))
{
// Report if the metric value is suspiciously large
error!("potentially abnormal metric value: {:?}", metric);
}
}
break;
}
Err(err) if err.is_timeout() => {
error!(attempt, "timeout sending metrics, retrying immediately");
continue;
}
Err(err) => {
error!(attempt, ?err, "failed to send metrics");
break;
}
}
}
}
}
async fn reschedule(
earlier_metric_at: SystemTime,
metric_collection_interval: Duration,
) -> Option<Duration> {
let now = SystemTime::now();
match now.duration_since(earlier_metric_at) {
Ok(from_last_send) if from_last_send < metric_collection_interval => {
let sleep_for = metric_collection_interval - from_last_send;
/// Internal type to make timeline metric production testable.
///
/// As this value type contains all of the information needed from a timeline to produce the
/// metrics, it can easily be created with different values in test.
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
}
let deadline = std::time::Instant::now() + sleep_for;
impl TimelineSnapshot {
/// Collect the metrics from an actual timeline.
///
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
fn collect(
t: &Arc<crate::tenant::Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<Option<Self>> {
use anyhow::Context;
tokio::time::sleep_until(deadline.into()).await;
if !t.is_active() {
// no collection for broken or stopping needed, we will still keep the cached values
// though at the caller.
Ok(None)
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let now = std::time::Instant::now();
let current_exact_logical_size = {
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};
// executor threads might be busy, add extra measurements
Some(if now < deadline {
deadline - now
} else {
now - deadline
})
Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
}))
}
Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)),
Err(_) => {
tracing::warn!(
?now,
?earlier_metric_at,
"oldest recorded metric is in future; first values will come out with inconsistent timestamps"
);
earlier_metric_at.duration_since(now).ok()
}
/// Produce the timeline consumption metrics into the `metrics` argument.
fn to_metrics(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
cache: &HashMap<MetricsKey, (EventType, u64)>,
) {
let timeline_written_size = u64::from(self.last_record_lsn);
let (key, written_size_now) =
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
// features might change this.
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
// use this when available, because in a stream of incremental values, it will be
// accurate, whereas when last_record_lsn stops moving, we will only cache the last
// one of those.
let last_stop_time = cache
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cache
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
// written_size_bytes_delta
metrics.extend(
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let up_to = written_size_now
.0
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
Some(key_value)
} else {
None
},
);
// written_size
metrics.push((key, written_size_now));
if let Some(size) = self.current_exact_logical_size {
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
}
}
}
/// Calculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
pub async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
// reminder: ticker is ready immediately
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let tick_at = tokio::select! {
_ = task_mgr::shutdown_watcher() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
return Ok(());
},
tick_at = ticker.tick() => {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
continue;
}
};
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
continue;
}
};
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
if tenant_state != TenantState::Active {
continue;
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
{
if let Err(e) = tenant.calculate_synthetic_size(
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
ctx).await {
error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
}
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
);
}
}
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::time::SystemTime;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::consumption_metrics::MetricsKey;
use super::TimelineSnapshot;
use chrono::{DateTime, Utc};
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_second_round() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, just_before, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let just_before = DateTime::<Utc>::from(just_before);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
before,
just_before,
0,
),
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
just_before,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
times[0] = std::time::SystemTime::now();
for behind in 1..N {
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
}
times
}
}
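A small arithmetic sketch of the written_size / written_data_bytes_delta relationship exercised by the tests above: the delta event spans from the previous point (the cached absolute value or, on the first round, loaded_at) up to now, and its value is the difference of the absolute written sizes; checked_sub guards against a cached value that is ahead of the current LSN. The function name is illustrative.

// value of the delta event between two absolute `written_size` samples
fn written_size_delta(prev_written: u64, written_now: u64) -> Option<u64> {
    // `None` means the cached value was ahead of us (e.g. a restart lost an
    // in-memory layer), in which case no positive delta can be reported here.
    written_now.checked_sub(prev_written)
}

fn main() {
    assert_eq!(written_size_delta(0x20000, 0x23000), Some(0x3000));
    assert_eq!(written_size_delta(0x23000, 0x20000), None);
}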

View File

@@ -1,117 +0,0 @@
use anyhow::Context;
use std::path::PathBuf;
use std::sync::Arc;
use super::RawMetric;
pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
// do not add context to each error, callsite will log with full path
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || {
let _e = span.entered();
if let Some(parent) = path.parent() {
if let Err(e) = scan_and_delete_with_same_prefix(&path) {
tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
}
}
let mut file = std::fs::File::open(&*path)?;
let reader = std::io::BufReader::new(&mut file);
anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
})
.await
.context("read metrics join error")
.and_then(|x| x)
}
fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
let prefix = path.file_name().expect("caller checked").to_string_lossy();
for entry in it {
let entry = entry?;
if !entry.metadata()?.is_file() {
continue;
}
let file_name = entry.file_name();
if path.file_name().unwrap() == file_name {
// do not remove our actual file
continue;
}
let file_name = file_name.to_string_lossy();
if !file_name.starts_with(&*prefix) {
continue;
}
let path = entry.path();
if let Err(e) = std::fs::remove_file(&path) {
tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
} else {
tracing::info!("cleaned up old tempfile {file_name:?}");
}
}
Ok(())
}
pub(super) async fn flush_metrics_to_disk(
current_metrics: &Arc<Vec<RawMetric>>,
path: &Arc<PathBuf>,
) -> anyhow::Result<()> {
use std::io::Write;
anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
anyhow::ensure!(
path.file_name().is_some(),
"path must have filename: {path:?}"
);
let span = tracing::Span::current();
tokio::task::spawn_blocking({
let current_metrics = current_metrics.clone();
let path = path.clone();
move || {
let _e = span.entered();
let parent = path.parent().expect("existence checked");
let file_name = path.file_name().expect("existence checked");
let mut tempfile = tempfile::Builder::new()
.prefix(file_name)
.suffix(".tmp")
.tempfile_in(parent)?;
tracing::debug!("using tempfile {:?}", tempfile.path());
// write out all of the raw metrics, to be read out later on restart as cached values
{
let mut writer = std::io::BufWriter::new(&mut tempfile);
serde_json::to_writer(&mut writer, &*current_metrics)
.context("serialize metrics")?;
writer
.into_inner()
.map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
}
tempfile.flush()?;
tempfile.as_file().sync_all()?;
fail::fail_point!("before-persist-last-metrics-collected");
drop(tempfile.persist(&*path).map_err(|e| e.error)?);
let f = std::fs::File::open(path.parent().unwrap())?;
f.sync_all()?;
anyhow::Ok(())
}
})
.await
.with_context(|| format!("write metrics to {path:?} join error"))
.and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
}
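A hedged, standalone sketch of the atomic-write pattern used by flush_metrics_to_disk above (assumes the tempfile and anyhow crates; the helper name is made up): write a tempfile next to the target, fsync it, rename it over the target, then fsync the directory so the rename itself is durable.

use std::io::Write;
use std::path::Path;

fn write_atomically(path: &Path, bytes: &[u8]) -> anyhow::Result<()> {
    let parent = path.parent().expect("path must have a parent");
    let file_name = path.file_name().expect("path must have a file name");

    // tempfile in the same directory so the final rename stays on one filesystem
    let mut tmp = tempfile::Builder::new()
        .prefix(file_name)
        .suffix(".tmp")
        .tempfile_in(parent)?;
    tmp.write_all(bytes)?;
    tmp.as_file().sync_all()?;

    // rename over the final path; the old contents stay intact on failure
    tmp.persist(path).map_err(|e| e.error)?;

    // make the rename itself durable
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}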

View File

@@ -1,455 +0,0 @@
use crate::context::RequestContext;
use anyhow::Context;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use super::{Cache, RawMetric};
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
/// instead of static str.
// Do not rename any of these without first consulting with data team and partner
// management.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(super) enum Name {
/// Timeline last_record_lsn, absolute
#[serde(rename = "written_size")]
WrittenSize,
/// Timeline last_record_lsn, incremental
#[serde(rename = "written_data_bytes_delta")]
WrittenSizeDelta,
/// Timeline logical size
#[serde(rename = "timeline_logical_size")]
LogicalSize,
/// Tenant remote size
#[serde(rename = "remote_storage_size")]
RemoteSize,
/// Tenant resident size
#[serde(rename = "resident_size")]
ResidentSize,
/// Tenant synthetic size
#[serde(rename = "synthetic_storage_size")]
SyntheticSize,
}
/// Key that uniquely identifies the object this metric describes.
///
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
/// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
pub(super) metric: Name,
}
impl MetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(MetricsKey);
impl AbsoluteValueFactory {
const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
let key = self.0;
(key, (EventType::Absolute { time }, val))
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(MetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
const fn from_until(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> RawMetric {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
let when = EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
};
(key, (when, val))
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
// the static part of a MetricsKey
impl MetricsKey {
/// Absolute value of [`Timeline::get_last_record_lsn`].
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::WrittenSize,
}
.absolute_values()
}
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
/// previously sent, starting from the previously sent incremental time range ending at the
/// latest absolute measurement.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::WrittenSizeDelta,
}
.incremental_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::LogicalSize,
}
.absolute_values()
}
/// [`Tenant::remote_size`]
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::RemoteSize,
}
.absolute_values()
}
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::ResidentSize,
}
.absolute_values()
}
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
/// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::SyntheticSize,
}
.absolute_values()
}
}
pub(super) async fn collect_all_metrics(
cached_metrics: &Cache,
ctx: &RequestContext,
) -> Vec<RawMetric> {
use pageserver_api::models::TenantState;
let started_at = std::time::Instant::now();
let tenants = match crate::tenant::mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
tracing::error!("failed to list tenants: {:?}", err);
return vec![];
}
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.await
.ok()
.map(|tenant| (id, tenant))
}
});
let res = collect(tenants, cached_metrics, ctx).await;
tracing::info!(
elapsed_ms = started_at.elapsed().as_millis(),
total = res.len(),
"collected metrics"
);
res
}
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
where
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
{
let mut current_metrics: Vec<RawMetric> = Vec::new();
let mut tenants = std::pin::pin!(tenants);
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
Ok(Some(snap)) => {
snap.to_metrics(
tenant_id,
timeline_id,
Utc::now(),
&mut current_metrics,
cache,
);
}
Ok(None) => {}
Err(e) => {
tracing::error!(
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
timeline.timeline_id
);
continue;
}
}
tenant_resident_size += timeline.resident_physical_size();
}
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
}
current_metrics
}
/// In-between abstraction to allow testing metrics without actual Tenants.
struct TenantSnapshot {
resident_size: u64,
remote_size: u64,
synthetic_size: u64,
}
impl TenantSnapshot {
/// Collect tenant status to have metrics created out of it.
///
/// `resident_size` is calculated from the timelines we had access to for other metrics, so we
/// cannot just list timelines here.
fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
TenantSnapshot {
resident_size,
remote_size: t.remote_size(),
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
synthetic_size: t.cached_synthetic_size(),
}
}
fn to_metrics(
&self,
tenant_id: TenantId,
now: DateTime<Utc>,
cached: &Cache,
metrics: &mut Vec<RawMetric>,
) {
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
let synthetic_size = {
let factory = MetricsKey::synthetic_size(tenant_id);
let mut synthetic_size = self.synthetic_size;
if synthetic_size == 0 {
if let Some((_, value)) = cached.get(factory.key()) {
// use the latest value from previous session
synthetic_size = *value;
}
}
if synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
Some(factory.at(now, synthetic_size))
} else {
None
}
};
metrics.extend(
[Some(remote_size), Some(resident_size), synthetic_size]
.into_iter()
.flatten(),
);
}
}
/// Internal type to make timeline metric production testable.
///
/// As this value type contains all of the information needed from a timeline to produce the
/// metrics, it can easily be created with different values in test.
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
}
impl TimelineSnapshot {
/// Collect the metrics from an actual timeline.
///
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
fn collect(
t: &Arc<crate::tenant::Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<Option<Self>> {
if !t.is_active() {
// no collection for broken or stopping needed, we will still keep the cached values
// though at the caller.
Ok(None)
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};
Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
}))
}
}
/// Produce the timeline consumption metrics into the `metrics` argument.
fn to_metrics(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
metrics: &mut Vec<RawMetric>,
cache: &Cache,
) {
let timeline_written_size = u64::from(self.last_record_lsn);
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
let last_stop_time = cache
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
let (key, written_size_now) =
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cache
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
let up_to = now;
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
// written_size_delta
metrics.push(key_value);
// written_size
metrics.push((key, written_size_now));
} else {
// the cached value was ahead of us, report zero until we've caught up
metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
// the cached value was ahead of us, report the same until we've caught up
metrics.push((key, (written_size_now.0, prev.1)));
}
{
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
let current_or_previous = self
.current_exact_logical_size
.or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
if let Some(size) = current_or_previous {
metrics.push(factory.at(now, size));
}
}
}
}
#[cfg(test)]
mod tests;
#[cfg(test)]
pub(crate) use tests::metric_examples;
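A hedged, in-module usage sketch (assuming serde_json) of the Name enum defined above: the serde renames make it round-trip as the exact wire-format strings the downstream pipeline expects, for example "written_data_bytes_delta" for Name::WrittenSizeDelta.

fn name_wire_format_example() {
    // serializes to the renamed wire string, not the Rust identifier
    let json = serde_json::to_string(&Name::WrittenSizeDelta).unwrap();
    assert_eq!(json, "\"written_data_bytes_delta\"");

    // and parses back from the wire string
    let parsed: Name = serde_json::from_str("\"synthetic_storage_size\"").unwrap();
    assert_eq!(parsed, Name::SyntheticSize);
}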

View File

@@ -1,297 +0,0 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_second_round() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, just_before, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let just_before = DateTime::<Utc>::from(just_before);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
// it can happen that we lose the in-memory layer but have previously sent metrics;
// the reported written_size should never go backwards
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [later, now, at_restart] = time_backwards();
// FIXME: tests would be so much easier if we did not need to juggle back and forth
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
let now = DateTime::<Utc>::from(now);
let later = DateTime::<Utc>::from(later);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let way_before = DateTime::<Utc>::from(way_before);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
};
let mut cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
way_before,
before_restart,
// not taken into account, but the timestamps are important
999_999_999,
),
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
before_restart,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
]
);
// now if we cache these metrics, and re-run while "still in recovery"
cache.extend(metrics.drain(..));
// "still in recovery", because our snapshot did not change
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
]
);
}
#[test]
fn post_restart_current_exact_logical_size_uses_cached() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, at_restart] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
};
let cache = HashMap::from([
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
assert_eq!(
metrics,
&[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)]
);
}
#[test]
fn post_restart_synthetic_size_uses_cached_if_available() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
};
let now = SystemTime::now();
let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
let now = DateTime::<Utc>::from(now);
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
let mut metrics = Vec::new();
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
assert_eq!(
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
MetricsKey::synthetic_size(tenant_id).at(now, 1000),
]
);
}
#[test]
fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
};
let now = SystemTime::now();
let now = DateTime::<Utc>::from(now);
let cached = HashMap::new();
let mut metrics = Vec::new();
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
assert_eq!(
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
// no synthetic size here
]
);
}
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
times[0] = std::time::SystemTime::now();
for behind in 1..N {
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
}
times
}
pub(crate) const fn metric_examples(
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 6] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),
MetricsKey::synthetic_size(tenant_id).at(now, 1),
]
}

View File

@@ -1,443 +0,0 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId};
/// How the metrics from the pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
}
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
cached_metrics: &mut Cache,
) -> anyhow::Result<()> {
let mut uploaded = 0;
let mut failed = 0;
let started_at = std::time::Instant::now();
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
while let Some(res) = iter.next() {
let (chunk, body) = res?;
let event_bytes = body.len();
let is_last = iter.len() == 0;
let res = upload(client, metric_collection_endpoint, body, cancel, is_last)
.instrument(tracing::info_span!(
"upload",
%event_bytes,
uploaded,
total = metrics.len(),
))
.await;
match res {
Ok(()) => {
for (curr_key, curr_val) in chunk {
cached_metrics.insert(*curr_key, *curr_val);
}
uploaded += chunk.len();
}
Err(_) => {
// failure(s) have already been logged
//
// however this is an inconsistency: if we crash here, we will start with the
// values as uploaded. in practice, the rejections no longer happen.
failed += chunk.len();
}
}
}
let elapsed = started_at.elapsed();
tracing::info!(
uploaded,
failed,
elapsed_ms = elapsed.as_millis(),
"done sending metrics"
);
Ok(())
}
// The return type is quite ugly, but it lets us test the chunked serialization in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,
input: &'a [RawMetric],
factory: F,
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
where
F: KeyGen<'a> + 'a,
{
use bytes::BufMut;
struct Iter<'a, F> {
inner: std::slice::Chunks<'a, RawMetric>,
chunk_size: usize,
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
buffer: bytes::BytesMut,
// a chunk's worth of events is reused to produce each serialized document
scratch: Vec<Event<Ids, Name>>,
factory: F,
}
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
fn next(&mut self) -> Option<Self::Item> {
let chunk = self.inner.next()?;
if self.scratch.is_empty() {
// first round: create events with N strings
self.scratch.extend(
chunk
.iter()
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
);
} else {
// next rounds: update_in_place to reuse allocations
assert_eq!(self.scratch.len(), self.chunk_size);
self.scratch
.iter_mut()
.zip(chunk.iter())
.for_each(|(slot, raw_metric)| {
raw_metric.update_in_place(slot, &self.factory.generate())
});
}
let res = serde_json::to_writer(
(&mut self.buffer).writer(),
&EventChunk {
events: (&self.scratch[..chunk.len()]).into(),
},
);
match res {
Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
Err(e) => Some(Err(e)),
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);
let scratch = Vec::new();
Iter {
inner,
chunk_size,
buffer,
scratch,
factory,
}
}
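
The iterator above serializes every chunk into one shared BytesMut and hands out buffer.split().freeze(), so a retry only clones a reference-counted Bytes instead of re-serializing. A self-contained sketch of that buffer-reuse pattern, with illustrative names and plain Vec<u32> payloads instead of RawMetric:

use bytes::{BufMut, BytesMut};

fn serialize_chunks(chunks: &[Vec<u32>]) -> Result<Vec<bytes::Bytes>, serde_json::Error> {
    let mut buffer = BytesMut::new();
    let mut out = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        // write the JSON document into the shared buffer
        serde_json::to_writer((&mut buffer).writer(), chunk)?;
        // split() detaches what was written; the BytesMut is reused next round
        out.push(buffer.split().freeze());
    }
    Ok(out)
}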
trait RawMetricExt {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
}
impl RawMetricExt for RawMetric {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
Event {
kind,
metric,
idempotency_key: key.to_string(),
value,
extra: Ids {
tenant_id,
timeline_id,
},
}
}
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
use std::fmt::Write;
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
*event = Event {
kind,
metric,
idempotency_key: {
event.idempotency_key.clear();
write!(event.idempotency_key, "{key}").unwrap();
std::mem::take(&mut event.idempotency_key)
},
value,
extra: Ids {
tenant_id,
timeline_id,
},
};
}
}
trait KeyGen<'a>: Copy {
fn generate(&self) -> IdempotencyKey<'a>;
}
impl<'a> KeyGen<'a> for &'a str {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::generate(self)
}
}
enum UploadError {
Rejected(reqwest::StatusCode),
Reqwest(reqwest::Error),
Cancelled,
}
impl std::fmt::Debug for UploadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// use the same impl as Display because backoff::retry logs this with both
std::fmt::Display::fmt(self, f)
}
}
impl std::fmt::Display for UploadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use UploadError::*;
match self {
Rejected(code) => write!(f, "server rejected the metrics with {code}"),
Reqwest(e) => write!(f, "request failed: {e}"),
Cancelled => write!(f, "cancelled"),
}
}
}
impl UploadError {
fn is_reject(&self) -> bool {
matches!(self, UploadError::Rejected(_))
}
}
// this is consumed by the test verifiers
static LAST_IN_BATCH: reqwest::header::HeaderName =
reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
async fn upload(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
body: bytes::Bytes,
cancel: &CancellationToken,
is_last: bool,
) -> Result<(), UploadError> {
let warn_after = 3;
let max_attempts = 10;
let res = utils::backoff::retry(
move || {
let body = body.clone();
async move {
let res = client
.post(metric_collection_endpoint.clone())
.header(reqwest::header::CONTENT_TYPE, "application/json")
.header(
LAST_IN_BATCH.clone(),
if is_last { "true" } else { "false" },
)
.body(body)
.send()
.await;
let res = res.and_then(|res| res.error_for_status());
// 10 redirects are normally allowed, so we don't need to worry about 3xx
match res {
Ok(_response) => Ok(()),
Err(e) => {
let status = e.status().filter(|s| s.is_client_error());
if let Some(status) = status {
// rejection used to be a thing when the server could reject a
// whole batch of metrics if one metric was bad.
Err(UploadError::Rejected(status))
} else {
Err(UploadError::Reqwest(e))
}
}
}
}
},
UploadError::is_reject,
warn_after,
max_attempts,
"upload consumption_metrics",
utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
)
.await;
match &res {
Ok(_) => {}
Err(e) if e.is_reject() => {
// permanent errors currently do not get logged by backoff::retry
// display alternate has no effect, but keeping it here for easier pattern matching.
tracing::error!("failed to upload metrics: {e:#}");
}
Err(_) => {
// these have been logged already
}
}
res
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::{DateTime, Utc};
use once_cell::sync::Lazy;
#[test]
fn chunked_serialization() {
let examples = metric_samples();
assert!(examples.len() > 1);
let factory = FixedGen::new(Utc::now(), "1", 42);
// need to use Event here because serde_json::Value uses the default (unordered) hashmap,
// not a linked hashmap
#[derive(serde::Deserialize)]
struct EventChunk {
events: Vec<Event<Ids, Name>>,
}
let correct = serialize_in_chunks(examples.len(), &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
for chunk_size in 1..examples.len() {
let actual = serialize_in_chunks(chunk_size, &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
// if these are equal, it means that the multi-chunk version works as well
assert_eq!(correct, actual);
}
}
#[derive(Clone, Copy)]
struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
impl<'a> FixedGen<'a> {
fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
FixedGen(now, node_id, nonce)
}
}
impl<'a> KeyGen<'a> for FixedGen<'a> {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::for_tests(self.0, self.1, self.2)
}
}
static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
.unwrap()
.into()
});
#[test]
fn metric_image_stability() {
// it is important that these strings stay as they are
let examples = [
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
),
];
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
let examples = examples.into_iter().zip(metric_samples());
for ((line, expected), (key, (kind, value))) in examples {
let e = consumption_metrics::Event {
kind,
metric: key.metric,
idempotency_key: idempotency_key.to_string(),
value,
extra: Ids {
tenant_id: key.tenant_id,
timeline_id: key.timeline_id,
},
};
let actual = serde_json::to_string(&e).unwrap();
assert_eq!(expected, actual, "example for {kind:?} from line {line}");
}
}
fn metric_samples() -> [RawMetric; 6] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
.unwrap()
.into();
let [now, before] = [*SAMPLES_NOW, before];
super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
}
}

View File

@@ -444,7 +444,6 @@ impl<'a> WalIngest<'a> {
// need to clear the corresponding bits in the visibility map.
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
match self.timeline.pg_version {
14 => {
@@ -471,20 +470,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v14::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -504,12 +497,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v14::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -539,20 +526,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v15::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -572,12 +553,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v15::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -607,20 +582,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v16::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -640,12 +609,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v16::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -653,6 +616,7 @@ impl<'a> WalIngest<'a> {
}
_ => {}
}
// FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
// Clear the VM bits if required.
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -696,7 +660,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -712,7 +676,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -726,7 +690,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -753,8 +717,6 @@ impl<'a> WalIngest<'a> {
// need to clear the corresponding bits in the visibility map.
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
match self.timeline.pg_version {
@@ -783,14 +745,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
@@ -810,11 +772,7 @@ impl<'a> WalIngest<'a> {
}
}
pg_constants::XLOG_NEON_HEAP_LOCK => {
let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
/* XLOG_NEON_HEAP_LOCK doesn't need special care */
}
info => bail!("Unknown WAL record type for Neon RMGR: {}", info),
}
@@ -825,6 +783,8 @@ impl<'a> WalIngest<'a> {
),
}
// FIXME: What about XLOG_NEON_HEAP_LOCK?
// Clear the VM bits if required.
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
let vm_rel = RelTag {
@@ -867,7 +827,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -883,7 +843,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -897,7 +857,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
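
One side of this diff clears all visibility-map bits unconditionally, while the other selects between pg_constants::VISIBILITYMAP_VALID_BITS and pg_constants::VISIBILITYMAP_ALL_FROZEN depending on which WAL record touched the heap page. A compact sketch of the record-dependent selection, with a made-up HeapRecordKind enum standing in for the decoded record types (constant values mirror PostgreSQL's visibilitymap.h):

enum HeapRecordKind {
    Insert,
    Delete,
    Update,
    Lock,
    LockUpdated,
}

fn vm_flags_to_clear(kind: &HeapRecordKind) -> u8 {
    const VISIBILITYMAP_ALL_FROZEN: u8 = 0x02;
    const VISIBILITYMAP_VALID_BITS: u8 = 0x03;
    match kind {
        // a tuple change invalidates both the all-visible and all-frozen bits
        HeapRecordKind::Insert | HeapRecordKind::Delete | HeapRecordKind::Update => {
            VISIBILITYMAP_VALID_BITS
        }
        // locking a tuple only needs the all-frozen bit cleared
        HeapRecordKind::Lock | HeapRecordKind::LockUpdated => VISIBILITYMAP_ALL_FROZEN,
    }
}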

View File

@@ -219,66 +219,20 @@ pub mod v14 {
old_offnum: buf.get_u16_le(),
old_infobits_set: buf.get_u8(),
flags: buf.get_u8(),
t_cid: buf.get_u32_le(),
t_cid: buf.get_u32(),
new_xmax: buf.get_u32_le(),
new_offnum: buf.get_u16_le(),
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLock {
pub locking_xid: TransactionId,
pub offnum: OffsetNumber,
pub _padding: u16,
pub t_cid: u32,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLock {
pub fn decode(buf: &mut Bytes) -> XlHeapLock {
XlHeapLock {
locking_xid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
_padding: buf.get_u16_le(),
t_cid: buf.get_u32_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLockUpdated {
pub xmax: TransactionId,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLockUpdated {
pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated {
XlHeapLockUpdated {
xmax: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
}
pub mod v15 {
pub use super::v14::{
XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
};
pub use super::v14::{XlHeapDelete, XlHeapInsert, XlHeapMultiInsert, XlHeapUpdate};
}
pub mod v16 {
pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
pub use super::v14::{XlHeapInsert, XlHeapMultiInsert};
use bytes::{Buf, Bytes};
use postgres_ffi::{OffsetNumber, TransactionId};
@@ -324,26 +278,6 @@ pub mod v16 {
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLock {
pub locking_xid: TransactionId,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLock {
pub fn decode(buf: &mut Bytes) -> XlHeapLock {
XlHeapLock {
locking_xid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
/* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */
pub mod rm_neon {
use bytes::{Buf, Bytes};
@@ -432,28 +366,6 @@ pub mod v16 {
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlNeonHeapLock {
pub locking_xid: TransactionId,
pub t_cid: u32,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlNeonHeapLock {
pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock {
XlNeonHeapLock {
locking_xid: buf.get_u32_le(),
t_cid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
}
}

View File

@@ -153,7 +153,7 @@ lfc_ensure_opened(void)
return false;
}
}
return true;
return false;
}
static void
@@ -222,9 +222,8 @@ lfc_change_limit_hook(int newval, void *extra)
/*
* The stats collector detaches shared memory, so we should not try to access shared memory here.
* Parallel workers first assign the default value (0), so do not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */
@@ -641,7 +640,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
LWLockRelease(lfc_lock);
lfc_disable("write");
}
else
@@ -652,8 +650,9 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
LWLockRelease(lfc_lock);
}
LWLockRelease(lfc_lock);
}
/*

View File

@@ -121,7 +121,7 @@ async fn collect_metrics_iteration(
let current_metrics = gather_proxy_io_bytes_per_client();
let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
let metrics_to_send: Vec<Event<Ids>> = current_metrics
.iter()
.filter_map(|(curr_key, (curr_val, curr_time))| {
let mut start_time = *curr_time;

View File

@@ -1,7 +1,6 @@
pytest_plugins = (
"fixtures.pg_version",
"fixtures.parametrize",
"fixtures.httpserver",
"fixtures.neon_fixtures",
"fixtures.benchmark_fixture",
"fixtures.pg_stats",

View File

@@ -1,45 +0,0 @@
from typing import Tuple
import pytest
from pytest_httpserver import HTTPServer
# TODO: mypy fails with:
# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined]
# from fixtures.neon_fixtures import PortDistributor
# compared to the fixtures from pytest_httpserver with the same names, these are
# always function-scoped, so you can check and stop the server in tests.
@pytest.fixture(scope="function")
def httpserver_ssl_context():
return None
@pytest.fixture(scope="function")
def make_httpserver(httpserver_listen_address, httpserver_ssl_context):
host, port = httpserver_listen_address
if not host:
host = HTTPServer.DEFAULT_LISTEN_HOST
if not port:
port = HTTPServer.DEFAULT_LISTEN_PORT
server = HTTPServer(host=host, port=port, ssl_context=httpserver_ssl_context)
server.start()
yield server
server.clear()
if server.is_running():
server.stop()
@pytest.fixture(scope="function")
def httpserver(make_httpserver):
server = make_httpserver
yield server
server.clear()
@pytest.fixture(scope="function")
def httpserver_listen_address(port_distributor) -> Tuple[str, int]:
port = port_distributor.get_port()
return ("localhost", port)

View File

@@ -223,6 +223,12 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
@pytest.fixture(scope="session")
def httpserver_listen_address(port_distributor: PortDistributor):
port = port_distributor.get_port()
return ("localhost", port)
@pytest.fixture(scope="function")
def default_broker(
port_distributor: PortDistributor,
@@ -847,6 +853,18 @@ class NeonEnv:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
def timeline_dir(
self, tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None
) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
return (
self.tenant_dir(tenant_id, pageserver_id=pageserver_id) / "timelines" / str(timeline_id)
)
def tenant_dir(self, tenant_id: TenantId, pageserver_id: Optional[int] = None) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
return self.get_pageserver(pageserver_id).workdir / "tenants" / str(tenant_id)
def get_pageserver_version(self) -> str:
bin_pageserver = str(self.neon_binpath / "pageserver")
res = subprocess.run(
@@ -1568,21 +1586,6 @@ class NeonPageserver(PgProtocol):
'.*registered custom resource manager "neon".*',
]
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
if timeline_id is None:
return self.tenant_dir(tenant_id) / "timelines"
return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
def tenant_dir(
self,
tenant_id: Optional[TenantId] = None,
) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
if tenant_id is None:
return self.workdir / "tenants"
return self.workdir / "tenants" / str(tenant_id)
def start(
self,
overrides: Tuple[str, ...] = (),
@@ -2133,28 +2136,6 @@ class NeonProxy(PgProtocol):
def _wait_until_ready(self):
requests.get(f"http://{self.host}:{self.http_port}/v1/status")
def http_query(self, query, args, **kwargs):
# TODO maybe use default values if not provided
user = kwargs["user"]
password = kwargs["password"]
expected_code = kwargs.get("expected_code")
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
response = requests.post(
f"https://{self.domain}:{self.external_http_port}/sql",
data=json.dumps({"query": query, "params": args}),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
verify=str(self.test_output_dir / "proxy.crt"),
)
if expected_code is not None:
assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
return response.json()
def get_metrics(self) -> str:
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
request_result.raise_for_status()

View File

@@ -236,27 +236,15 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
if is_mock_s3:
if keys == 1 and len(objects) == 0 and len(common_prefixes) == 1:
# this has been seen in the wild in tests, with the contradictory logging linked below
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
# this seems like a mock_s3 issue
log.warn(
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
)
keys = 0
elif keys != 0 and len(objects) == 0:
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at the moto impl, it might be that there's a race where the common prefix (subdirectory) does not go away with deletes
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
if keys != 0 and len(objects) == 0:
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at the moto impl, it might be that there's a race where the common prefix (subdirectory) does not go away with deletes
common_prefixes = response.get("CommonPrefixes", [])
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"

View File

@@ -115,8 +115,6 @@ class S3Storage:
prefix_in_bucket: str
client: S3Client
cleanup: bool
"""Is this MOCK_S3 (false) or REAL_S3 (true)"""
real: bool
endpoint: Optional[str] = None
def access_env_vars(self) -> Dict[str, str]:
@@ -267,7 +265,6 @@ class RemoteStorageKind(str, enum.Enum):
prefix_in_bucket="",
client=client,
cleanup=False,
real=False,
)
assert self == RemoteStorageKind.REAL_S3
@@ -303,7 +300,6 @@ class RemoteStorageKind(str, enum.Enum):
prefix_in_bucket=prefix_in_bucket,
client=client,
cleanup=True,
real=True,
)

View File

@@ -44,7 +44,7 @@ def measure_recovery_time(env: NeonCompare):
# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline)
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)
# Start pageserver

View File

@@ -135,7 +135,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
@@ -166,7 +166,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

View File

@@ -20,7 +20,7 @@ from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser
from fixtures.types import Lsn
@@ -151,6 +151,7 @@ def test_create_snapshot(
shutil.copytree(test_output_dir, compatibility_snapshot_dir)
@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -208,6 +209,7 @@ def test_backward_compatibility(
), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")

View File

@@ -1,22 +1,16 @@
import os
import pathlib
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar
#
# Test CREATE DATABASE when there have been relmapper changes
#
@pytest.mark.parametrize("strategy", ["file_copy", "wal_log"])
def test_createdb(neon_simple_env: NeonEnv, strategy: str):
def test_createdb(neon_simple_env: NeonEnv):
env = neon_simple_env
if env.pg_version == PgVersion.V14 and strategy == "wal_log":
pytest.skip("wal_log strategy not supported on PostgreSQL 14")
env.neon_cli.create_branch("test_createdb", "empty")
endpoint = env.endpoints.create_start("test_createdb")
@@ -26,10 +20,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
# Cause a 'relmapper' change in the original branch
cur.execute("VACUUM FULL pg_class")
if env.pg_version == PgVersion.V14:
cur.execute("CREATE DATABASE foodb")
else:
cur.execute(f"CREATE DATABASE foodb STRATEGY={strategy}")
cur.execute("CREATE DATABASE foodb")
lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")

View File

@@ -42,11 +42,12 @@ def handle_role(dbs, roles, operation):
raise ValueError("Invalid op")
def ddl_forward_handler(
request: Request, dbs: Dict[str, str], roles: Dict[str, str], ddl: "DdlForwardingContext"
) -> Response:
fail = False
def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response:
log.info(f"Received request with data {request.get_data(as_text=True)}")
if ddl.fail:
if fail:
log.info("FAILING")
return Response(status=500, response="Failed just cuz")
if request.json is None:
@@ -71,7 +72,6 @@ class DdlForwardingContext:
self.port = port
self.dbs: Dict[str, str] = {}
self.roles: Dict[str, str] = {}
self.fail = False
endpoint = "/management/api/v2/roles_and_databases"
ddl_url = f"http://{host}:{port}{endpoint}"
self.pg.configure(
@@ -82,7 +82,7 @@ class DdlForwardingContext:
)
log.info(f"Listening on {ddl_url}")
self.server.expect_request(endpoint, method="PATCH").respond_with_handler(
lambda request: ddl_forward_handler(request, self.dbs, self.roles, self)
lambda request: ddl_forward_handler(request, self.dbs, self.roles)
)
def __enter__(self):
@@ -103,9 +103,6 @@ class DdlForwardingContext:
def wait(self, timeout=3):
self.server.wait(timeout=timeout)
def failures(self, bool):
self.fail = bool
def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]:
res = self.send(query)
self.wait(timeout=timeout)
@@ -206,9 +203,9 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
assert ddl.dbs == {"stork": "cork"}
with pytest.raises(psycopg2.InternalError):
ddl.failures(True)
global fail
fail = True
cur.execute("CREATE DATABASE failure WITH OWNER=cork")
ddl.wait()
ddl.failures(False)
conn.close()

View File

@@ -417,7 +417,7 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():

View File

@@ -15,45 +15,45 @@ from fixtures.types import TimelineId
# Test configuration
#
# Create a table with {NUM_ROWS} rows, and perform {UPDATES_TO_PERFORM} random
# UPDATEs on it, using {NUM_CONNECTIONS} separate connections.
NUM_CONNECTIONS = 10
NUM_ROWS = 100000
UPDATES_TO_PERFORM = 10000
# Create a table with {num_rows} rows, and perform {updates_to_perform} random
# UPDATEs on it, using {num_connections} separate connections.
num_connections = 10
num_rows = 100000
updates_to_perform = 10000
updates_performed = 0
# Run random UPDATEs on test table
async def update_table(endpoint: Endpoint):
global updates_performed
pg_conn = await endpoint.connect_async()
while updates_performed < updates_to_perform:
updates_performed += 1
id = random.randrange(1, num_rows)
await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
# Perform aggressive GC with 0 horizon
async def gc(env: NeonEnv, timeline: TimelineId):
pageserver_http = env.pageserver.http_client()
loop = asyncio.get_running_loop()
def do_gc():
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
with concurrent.futures.ThreadPoolExecutor() as pool:
while updates_performed < updates_to_perform:
await loop.run_in_executor(pool, do_gc)
# At the same time, run UPDATEs and GC
async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
workers = []
updates_performed = 0
# Perform aggressive GC with 0 horizon
async def gc(env: NeonEnv, timeline: TimelineId):
pageserver_http = env.pageserver.http_client()
nonlocal updates_performed
global UPDATES_TO_PERFORM
loop = asyncio.get_running_loop()
def do_gc():
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
with concurrent.futures.ThreadPoolExecutor() as pool:
while updates_performed < UPDATES_TO_PERFORM:
await loop.run_in_executor(pool, do_gc)
# Run random UPDATEs on test table
async def update_table(endpoint: Endpoint):
pg_conn = await endpoint.connect_async()
nonlocal updates_performed
while updates_performed < UPDATES_TO_PERFORM:
updates_performed += 1
id = random.randrange(1, NUM_ROWS)
await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
for _ in range(NUM_CONNECTIONS):
for _ in range(num_connections):
workers.append(asyncio.create_task(update_table(endpoint)))
workers.append(asyncio.create_task(gc(env, timeline)))
@@ -81,7 +81,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
f"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, {NUM_ROWS}) g
FROM generate_series(1, {num_rows}) g
"""
)
cur.execute("CREATE INDEX ON foo(id)")
@@ -91,7 +91,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT COUNT(*), SUM(counter) FROM foo")
r = cur.fetchone()
assert r is not None
assert r == (NUM_ROWS, UPDATES_TO_PERFORM)
assert r == (num_rows, updates_to_perform)
#
@@ -99,7 +99,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
@@ -161,5 +160,5 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
after = num_index_uploads
log.info(f"{after - before} new index uploads during test")
log.info(f"{after-before} new index uploads during test")
assert after - before < 5

View File

@@ -271,7 +271,7 @@ def _import(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

View File

@@ -55,7 +55,7 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
@@ -243,7 +243,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))
log.info("evict all layers")
@@ -251,7 +251,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0

View File

@@ -38,7 +38,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)
@@ -84,7 +84,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)

View File

@@ -1,44 +0,0 @@
import threading
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
#
# Test resizing the local file cache while a pgbench workload is running
#
@pytest.mark.timeout(600)
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.neon_cli.create_branch("test_lfc_resize", "empty")
endpoint = env.endpoints.create_start(
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 10
log.info("postgres is running on 'test_lfc_resize' branch")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
conn = endpoint.connect()
cur = conn.cursor()
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
thread.join()

View File

@@ -0,0 +1,265 @@
#
# Test for collecting metrics from pageserver and proxy.
# Use mock HTTP server to receive metrics and verify that they look sane.
#
import time
from pathlib import Path
from typing import Iterator
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
PSQL,
NeonEnvBuilder,
NeonProxy,
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
# ==============================================================================
# Storage metrics tests
# ==============================================================================
initial_tenant = TenantId.generate()
remote_uploaded = 0
checks = {
"written_size": lambda value: value > 0,
"resident_size": lambda value: value >= 0,
# the >= 0 check here is to avoid a race condition when we receive metrics before
# remote_uploaded is updated
"remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
# logical size may lag behind the actual size, so allow 0 here
"timeline_logical_size": lambda value: value >= 0,
}
metric_kinds_checked = set([])
#
# verify that metrics look minimally sane
#
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
for event in events:
assert event["tenant_id"] == str(
initial_tenant
), "Expecting metrics only from the initial tenant"
metric_name = event["metric"]
check = checks.get(metric_name)
# calm down mypy
if check is not None:
assert check(event["value"]), f"{metric_name} isn't valid"
global metric_kinds_checked
metric_kinds_checked.add(metric_name)
return Response(status=200)
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = (
f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
"""
+ "tenant_config={pitr_interval = '0 sec'}"
)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# Set the initial tenant of the test, which we expect the metrics to come from
global initial_tenant
initial_tenant = neon_env_builder.initial_tenant
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start()
# The order of fixture shutdown is not specified; if the http server goes down
# before the pageserver, the pageserver log might contain such errors at the end.
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_metric_collection")
endpoint = env.endpoints.create_start("test_metric_collection")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Helper function that gets the number of remote ops of the given kind from the metrics
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
ps_metrics = env.pageserver.http_client().get_metrics()
total = 0.0
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
):
total += sample[2]
return int(total)
# upload some data to remote storage
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
global remote_uploaded
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# wait longer than collecting interval and check that all requests are served
time.sleep(3)
httpserver.check()
global metric_kinds_checked, checks
expected_checks = set(checks.keys())
assert len(metric_kinds_checked) == len(
checks
), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"
# ==============================================================================
# Proxy metrics tests
# ==============================================================================
def proxy_metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
# perform basic sanity checks
for event in events:
assert event["metric"] == "proxy_io_bytes_per_client"
assert event["endpoint_id"] == "test_endpoint_id"
assert event["value"] >= 0
assert event["stop_time"] >= event["start_time"]
return Response(status=200)
@pytest.fixture(scope="function")
def proxy_with_metric_collector(
port_distributor: PortDistributor,
neon_binpath: Path,
httpserver_listen_address,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes through link auth and has metric collection enabled."""
http_port = port_distributor.get_port()
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
metric_collection_interval = "5s"
with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
metric_collection_endpoint=metric_collection_endpoint,
metric_collection_interval=metric_collection_interval,
auth_backend=NeonProxy.Link(),
) as proxy:
proxy.start()
yield proxy
@pytest.mark.asyncio
async def test_proxy_metric_collection(
httpserver: HTTPServer,
proxy_with_metric_collector: NeonProxy,
vanilla_pg: VanillaPostgres,
):
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
proxy_metrics_handler
)
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run(
"create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42"
)
base_uri = proxy_with_metric_collector.link_auth_uri
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42")
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(
vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False
)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
httpserver.check()

View File

@@ -3,6 +3,7 @@
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Tuple
import pytest
@@ -114,7 +115,7 @@ def test_ondemand_download_large_rel(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -236,7 +237,7 @@ def test_ondemand_download_timetravel(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -300,7 +301,6 @@ def test_ondemand_download_timetravel(
# they are present only in the remote storage, only locally, or both.
# It should not change.
assert filled_current_physical == get_api_current_physical_size()
endpoint_old.stop()
#
@@ -323,8 +323,8 @@ def test_download_remote_layers_api(
"compaction_period": "0s",
# small checkpoint distance to create more delta layer files
"checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB
"compaction_threshold": "999999",
"image_creation_threshold": "999999",
"compaction_threshold": "1",
"image_creation_threshold": "1",
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
}
)
@@ -357,20 +357,8 @@ def test_download_remote_layers_api(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)
# Shut down safekeepers before starting the pageserver.
# If we don't, they might stream us more WAL.
for sk in env.safekeepers:
sk.stop()
# it is sad that we cannot flush an in-memory layer without compaction, but
# we work around it with a very high layer0 count and image layer creation
# threshold
client.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload_queue_empty(client, tenant_id, timeline_id)
filled_current_physical = get_api_current_physical_size()
log.info(f"filled_current_physical: {filled_current_physical}")
log.info(filled_current_physical)
filled_size = get_resident_physical_size()
log.info(f"filled_size: {filled_size}")
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
@@ -378,10 +366,18 @@ def test_download_remote_layers_api(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer.name}")
layer.unlink()
# Shut down safekeepers before starting the pageserver.
# If we don't, the tenant's walreceiver handler will trigger the
# logical size computation task, and that downloads layers,
# which makes our assertions on size fail.
for sk in env.safekeepers:
sk.stop(immediate=True)
##### Second start, restore the data and ensure it's the same
env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
env.pageserver.allowed_errors.extend(
@@ -395,21 +391,32 @@ def test_download_remote_layers_api(
###### Phase 1: exercise download error code path
# the comparison here only requires the size to be at least the previous size, because it's possible we received WAL after last_flush_lsn_upload;
# a witnessed difference of, for example, 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail the test.
this_time = get_api_current_physical_size()
assert (
filled_current_physical == this_time
filled_current_physical <= this_time
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
if filled_current_physical != this_time:
log.info(
f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})"
)
filled_current_physical = this_time
post_unlink_size = get_resident_physical_size()
log.info(f"post_unlink_size: {post_unlink_size}")
assert (
post_unlink_size < filled_size
), "we just deleted layers and didn't cause anything to re-download them yet"
assert filled_size - post_unlink_size > 5 * (
1024**2
), "we may be downloading some layers as part of tenant activation"
# issue downloads that we know will fail
info = client.timeline_download_remote_layers(
tenant_id,
timeline_id,
# allow some concurrency to unveil potential concurrency bugs
max_concurrent_downloads=10,
errors_ok=True,
at_least_one_download=False,
@@ -418,9 +425,9 @@ def test_download_remote_layers_api(
assert info["state"] == "Completed"
assert info["total_layer_count"] > 0
assert info["successful_download_count"] == 0
# can't assert == total_layer_count because timeline_detail also tries to
# download layers for logical size, but this might not always hold.
assert info["failed_download_count"] > 0
assert (
info["failed_download_count"] > 0
) # can't assert == total_layer_count because attach + tenant status downloads some layers
assert (
info["total_layer_count"]
== info["successful_download_count"] + info["failed_download_count"]
@@ -429,6 +436,7 @@ def test_download_remote_layers_api(
assert (
get_resident_physical_size() == post_unlink_size
), "didn't download anything new due to failpoint"
# would be nice to assert that the layers in the layer map are still RemoteLayer
##### Retry, this time without failpoints
client.configure_failpoints(("remote-storage-download-pre-rename", "off"))

View File

@@ -1,481 +0,0 @@
import json
import time
from dataclasses import dataclass
from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but that would make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr; we will use manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# the httpserver is shut down before the pageserver during a passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Helper function that gets the number of remote ops of the given kind from the metrics
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
ps_metrics = env.pageserver.http_client().get_metrics()
total = 0.0
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
):
total += sample[2]
return int(total)
remote_uploaded = 0
# upload some data to remote storage
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# we expect uploads at 1Hz; on busy runners this could be too optimistic,
# so give it 5s. we only want to get the upload that follows the "ready" value.
timeout = 5
# these strings in the upload queue allow synchronizing the uploads
# with the main test execution
uploads.put("ready")
# note that this verifier graph should live across restarts as long as the
# cache file lives
v = MetricsVerifier()
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
if "synthetic_storage_size" not in v.accepted_event_names():
log.info("waiting for synthetic storage size to be calculated and uploaded...")
rounds = 0
while "synthetic_storage_size" not in v.accepted_event_names():
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
rounds += 1
assert rounds < 10, "did not get synthetic_storage_size in 10 uploads"
# once we have it in verifiers, it will assert that future batches will contain it
env.pageserver.stop()
time.sleep(1)
uploads.put("ready")
env.pageserver.start()
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout * 3)
v.ingest(events, is_last)
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
httpserver.check()
def test_metric_collection_cleans_up_tempfile(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but that would make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr; we will use manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
pageserver_http = env.pageserver.http_client()
# the httpserver is shut down before the pageserver during a passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
# we expect uploads at 1Hz; on busy runners this could be too optimistic,
# so give it 5s. we only want to get the upload that follows the "ready" value.
timeout = 5
# these strings in the upload queue allow synchronizing the uploads
# with the main test execution
uploads.put("ready")
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
# should really configure an env?
pageserver_http.configure_failpoints(("before-persist-last-metrics-collected", "exit"))
time.sleep(3)
env.pageserver.stop()
initially = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
assert (
len(initially.matching) == 2
), f"expecting actual file and tempfile, but not found: {initially.matching}"
uploads.put("ready")
env.pageserver.start()
while True:
events = uploads.get(timeout=timeout * 3)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
env.pageserver.stop()
later = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
# it is possible we shut down the pageserver at exactly the right time, so the old tempfile
# is gone, but we also have a new one.
only = set(["last_consumption_metrics.json"])
assert (
initially.matching.intersection(later.matching) == only
), "only initial tempfile should had been removed"
assert initially.other.issuperset(later.other), "no other files should had been removed"
@dataclass
class PrefixPartitionedFiles:
matching: Set[str]
other: Set[str]
def iterate_pageserver_workdir(path: Path, prefix: str) -> PrefixPartitionedFiles:
"""
Iterates the files in the workdir, returns two sets:
- files with the prefix
- files without the prefix
"""
matching = set()
other = set()
for entry in path.iterdir():
if not entry.is_file():
continue
if not entry.name.startswith(prefix):
other.add(entry.name)
else:
matching.add(entry.name)
return PrefixPartitionedFiles(matching, other)
class MetricsVerifier:
"""
A graph of per-tenant, per-timeline verifiers, with one verifier for each
metric
"""
def __init__(self):
self.tenants: Dict[TenantId, TenantMetricsVerifier] = {}
pass
def ingest(self, events, is_last):
stringified = json.dumps(events, indent=2)
log.info(f"ingesting: {stringified}")
for event in events:
id = TenantId(event["tenant_id"])
if id not in self.tenants:
self.tenants[id] = TenantMetricsVerifier(id)
self.tenants[id].ingest(event)
if is_last:
for t in self.tenants.values():
t.post_batch()
def accepted_event_names(self) -> Set[str]:
names: Set[str] = set()
for t in self.tenants.values():
names = names.union(t.accepted_event_names())
return names
class TenantMetricsVerifier:
def __init__(self, id: TenantId):
self.id = id
self.timelines: Dict[TimelineId, TimelineMetricsVerifier] = {}
self.state: Dict[str, Any] = {}
def ingest(self, event):
assert TenantId(event["tenant_id"]) == self.id
if "timeline_id" in event:
id = TimelineId(event["timeline_id"])
if id not in self.timelines:
self.timelines[id] = TimelineMetricsVerifier(self.id, id)
self.timelines[id].ingest(event)
else:
name = event["metric"]
if name not in self.state:
self.state[name] = PER_METRIC_VERIFIERS[name]()
self.state[name].ingest(event, self)
def post_batch(self):
for v in self.state.values():
v.post_batch(self)
for tl in self.timelines.values():
tl.post_batch(self)
def accepted_event_names(self) -> Set[str]:
names = set(self.state.keys())
for t in self.timelines.values():
names = names.union(t.accepted_event_names())
return names
class TimelineMetricsVerifier:
def __init__(self, tenant_id: TenantId, timeline_id: TimelineId):
self.id = timeline_id
self.state: Dict[str, Any] = {}
def ingest(self, event):
name = event["metric"]
if name not in self.state:
self.state[name] = PER_METRIC_VERIFIERS[name]()
self.state[name].ingest(event, self)
def post_batch(self, parent):
for v in self.state.values():
v.post_batch(self)
def accepted_event_names(self) -> Set[str]:
return set(self.state.keys())
class CannotVerifyAnything:
"""We can only assert types, but rust already has types, so no need."""
def __init__(self):
pass
def ingest(self, event, parent):
pass
def post_batch(self, parent):
pass
class WrittenDataVerifier:
def __init__(self):
self.values = []
pass
def ingest(self, event, parent):
self.values.append(event["value"])
def post_batch(self, parent):
pass
class WrittenDataDeltaVerifier:
def __init__(self):
self.value = None
self.sum = 0
self.timerange = None
pass
def ingest(self, event, parent):
assert event["type"] == "incremental"
self.value = event["value"]
self.sum += event["value"]
start = event["start_time"]
stop = event["stop_time"]
timerange = (start, stop)
if self.timerange is not None:
# this holds across restarts
assert self.timerange[1] == timerange[0], "time ranges should be continuous"
self.timerange = timerange
def post_batch(self, parent):
absolute = parent.state["written_size"]
if len(absolute.values) == 1:
# in tests this comes up as initdb execution, so we can have 0 or
# about 30MB on the first event. it is not consistent.
assert self.value is not None
else:
assert self.value == absolute.values[-1] - absolute.values[-2]
# sounds like this should hold, but it will not for branches -- probably related to timing
# assert self.sum == absolute.latest
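# Hypothetical numbers for the invariant checked in post_batch above: if consecutive
# absolute written_size values were 100 and 130, the incremental
# written_data_bytes_delta event covering the same interval is expected to carry
# value 130 - 100 == 30.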
class SyntheticSizeVerifier:
def __init__(self):
self.prev = None
self.value = None
pass
def ingest(self, event, parent):
assert isinstance(parent, TenantMetricsVerifier)
assert event["type"] == "absolute"
value = event["value"]
self.value = value
def post_batch(self, parent):
if self.prev is not None:
# this is assuming no one goes and deletes the cache file
assert (
self.value is not None
), "after calculating first synthetic size, cached or more recent should be sent"
self.prev = self.value
self.value = None
PER_METRIC_VERIFIERS = {
"remote_storage_size": CannotVerifyAnything,
"resident_size": CannotVerifyAnything,
"written_size": WrittenDataVerifier,
"written_data_bytes_delta": WrittenDataDeltaVerifier,
"timeline_logical_size": CannotVerifyAnything,
"synthetic_storage_size": SyntheticSizeVerifier,
}
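# A minimal, hand-written sketch (not used by the tests) of driving the verifier
# graph above. The event below is hypothetical: it only carries the fields the
# verifiers read, and the tenant id is an arbitrary 32-character hex string that
# TenantId is assumed to accept; the real test feeds in the batches received by
# the mock http server.
def example_drive_metrics_verifier():
    v = MetricsVerifier()
    batch = [
        {
            "tenant_id": "3fa85f64b1d94c6eb9f1c9d9cbb0a000",  # hypothetical tenant id
            "metric": "synthetic_storage_size",
            "type": "absolute",  # tenant-scoped absolute metric, handled by SyntheticSizeVerifier
            "value": 1234,
        }
    ]
    # is_last=True marks the final upload in the batch and triggers post_batch() checks
    v.ingest(batch, is_last=True)
    assert "synthetic_storage_size" in v.accepted_event_names()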

View File

@@ -346,13 +346,23 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
def get_pid(status: int, pw: str) -> Any:
return static_proxy.http_query(
"SELECT pid FROM pg_stat_activity WHERE state = 'active'",
[],
user="http_auth",
password=pw,
expected_code=status,
connstr = (
f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
)
response = requests.post(
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
data=json.dumps(
{"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []}
),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
verify=str(static_proxy.test_output_dir / "proxy.crt"),
)
assert response.status_code == status
return response.json()
pid1 = get_pid(200, "http")["rows"][0]["pid"]
@@ -377,23 +387,3 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
# old password should not work
res = get_pid(400, "http")
assert "password authentication failed for user" in res["message"]
# Beginning a transaction should not impact the next query,
# which might come from a completely different client.
@pytest.mark.xfail(reason="not implemented")
def test_http_pool_begin(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
def query(status: int, query: str, *args) -> Any:
static_proxy.http_query(
query,
args,
user="http_auth",
password="http",
expected_code=status,
)
query(200, "BEGIN;")
query(400, "garbage-lol(&(&(&(&") # Intentional error to break the transaction
query(200, "SELECT 1;") # Query that should succeed regardless of the transaction

View File

@@ -1,113 +0,0 @@
from pathlib import Path
from typing import Iterator
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
PSQL,
NeonProxy,
VanillaPostgres,
)
from fixtures.port_distributor import PortDistributor
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
def proxy_metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
# perform basic sanity checks
for event in events:
assert event["metric"] == "proxy_io_bytes_per_client"
assert event["endpoint_id"] == "test_endpoint_id"
assert event["value"] >= 0
assert event["stop_time"] >= event["start_time"]
return Response(status=200)
@pytest.fixture(scope="function")
def proxy_with_metric_collector(
port_distributor: PortDistributor,
neon_binpath: Path,
httpserver_listen_address,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes through link auth and has metric collection enabled."""
http_port = port_distributor.get_port()
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
metric_collection_interval = "5s"
with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
metric_collection_endpoint=metric_collection_endpoint,
metric_collection_interval=metric_collection_interval,
auth_backend=NeonProxy.Link(),
) as proxy:
proxy.start()
yield proxy
@pytest.mark.asyncio
async def test_proxy_metric_collection(
httpserver: HTTPServer,
proxy_with_metric_collector: NeonProxy,
vanilla_pg: VanillaPostgres,
):
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
proxy_metrics_handler
)
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run(
"create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42"
)
base_uri = proxy_with_metric_collector.link_auth_uri
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42")
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(
vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False
)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
httpserver.check()

View File

@@ -6,6 +6,7 @@ import queue
import shutil
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pytest
@@ -136,7 +137,7 @@ def test_remote_storage_backup_and_restore(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -352,7 +353,7 @@ def test_remote_storage_upload_queue_retries(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -487,7 +488,7 @@ def test_remote_timeline_client_calls_started_metric(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -532,7 +533,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
client = env.pageserver.http_client()
@@ -703,9 +704,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)
local_metadata = (
env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
)
local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
assert local_metadata.is_file()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

View File

@@ -299,7 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
# tenant is created with defaults, as in without config file
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
config_path = env.pageserver.tenant_dir(tenant_id) / "config"
config_path = env.pageserver.workdir / "tenants" / str(tenant_id) / "config"
assert config_path.exists(), "config file is always initially created"
http_client = env.pageserver.http_client()

View File

@@ -89,7 +89,7 @@ def test_tenant_delete_smoke(
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
tenant_path = env.pageserver.tenant_dir(tenant_id)
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
@@ -269,7 +269,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
tenant_dir = env.pageserver.tenant_dir(tenant_id)
tenant_dir = env.tenant_dir(tenant_id)
# Check local is empty
assert not tenant_dir.exists()
@@ -366,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = env.pageserver.workdir / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -379,7 +379,7 @@ def test_tenant_delete_is_resumed_on_attach(
wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn't have created the tenant dir on disk
tenant_path = env.pageserver.tenant_dir(tenant_id)
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -119,6 +119,65 @@ def test_tenant_reattach(
num_connections = 10
num_rows = 100000
updates_to_perform = 0
updates_started = 0
updates_finished = 0
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
global updates_started, updates_finished, updates_to_perform
while updates_started < updates_to_perform or updates_to_perform == 0:
updates_started += 1
id = random.randrange(1, num_rows)
# Loop to retry until the UPDATE succeeds
while True:
try:
await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
updates_finished += 1
if updates_finished % 1000 == 0:
log.info(f"update {updates_finished} / {updates_to_perform}")
break
except asyncpg.PostgresError as e:
# Received error from Postgres. Log it, sleep a little, and continue
log.info(f"UPDATE error: {e}")
await asyncio.sleep(0.1)
async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
global updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500)
log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
updates_to_perform = updates_started + 5000
# async guts of the test_tenant_reattach_while_busy test
async def reattach_while_busy(
env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
workers = []
for _ in range(num_connections):
pg_conn = await endpoint.connect_async()
workers.append(asyncio.create_task(update_table(pg_conn)))
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
await asyncio.gather(*workers)
assert updates_finished == updates_to_perform
# Detach and re-attach tenant, while compute is busy running queries.
#
@@ -167,62 +226,6 @@ def test_tenant_reattach_while_busy(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
updates_started = 0
updates_finished = 0
updates_to_perform = 0
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
nonlocal updates_started, updates_finished, updates_to_perform
while updates_started < updates_to_perform or updates_to_perform == 0:
updates_started += 1
id = random.randrange(1, num_rows)
# Loop to retry until the UPDATE succeeds
while True:
try:
await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
updates_finished += 1
if updates_finished % 1000 == 0:
log.info(f"update {updates_finished} / {updates_to_perform}")
break
except asyncpg.PostgresError as e:
# Received error from Postgres. Log it, sleep a little, and continue
log.info(f"UPDATE error: {e}")
await asyncio.sleep(0.1)
async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
nonlocal updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500)
log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
updates_to_perform = updates_started + 5000
# async guts of the test_tenant_reattach_while_busy test
async def reattach_while_busy(
env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
nonlocal updates_to_perform, updates_finished
workers = []
for _ in range(num_connections):
pg_conn = await endpoint.connect_async()
workers.append(asyncio.create_task(update_table(pg_conn)))
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
await asyncio.gather(*workers)
assert updates_finished == updates_to_perform
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -286,7 +289,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -329,7 +332,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
log.info("gc thread returned")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
with pytest.raises(
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
@@ -354,7 +357,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -383,7 +386,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
log.info("ignored tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -410,7 +413,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -427,7 +430,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
log.info("regular tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -528,7 +531,7 @@ def test_ignored_tenant_reattach(
pageserver_http = env.pageserver.http_client()
ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
tenant_dir = env.pageserver.workdir / "tenants" / str(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
@@ -619,7 +622,7 @@ def test_ignored_tenant_download_missing_layers(
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
@@ -672,7 +675,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
metadata_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name == "metadata":

View File

@@ -216,7 +216,7 @@ def switch_pg_to_new_pageserver(
endpoint.start()
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
@@ -561,7 +561,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# simpler than initializing a new one from scratch, but the effect on the single tenant
# is the same.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
env.pageserver.start()
# This fail point will pause the WAL ingestion on the main branch, after the
@@ -709,7 +709,7 @@ def test_emergency_relocate_with_branches_createdb(
# Kill the pageserver, remove the tenant directory, and restart
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
env.pageserver.start()
# Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original

View File

@@ -27,7 +27,7 @@ from prometheus_client.samples import Sample
def test_tenant_creation_fails(neon_simple_env: NeonEnv):
tenants_dir = neon_simple_env.pageserver.tenant_dir()
tenants_dir = Path(neon_simple_env.pageserver.workdir) / "tenants"
initial_tenants = sorted(
map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
)
@@ -320,7 +320,13 @@ def test_pageserver_with_empty_tenants(
)
files_in_timelines_dir = sum(
1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines))
1
for _p in Path.iterdir(
Path(env.pageserver.workdir)
/ "tenants"
/ str(tenant_with_empty_timelines)
/ "timelines"
)
)
assert (
files_in_timelines_dir == 0
@@ -331,7 +337,9 @@ def test_pageserver_with_empty_tenants(
env.pageserver.stop()
tenant_without_timelines_dir = env.initial_tenant
shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir))
shutil.rmtree(
Path(env.pageserver.workdir) / "tenants" / str(tenant_without_timelines_dir) / "timelines"
)
env.pageserver.start()

View File

@@ -179,7 +179,9 @@ def test_tenants_attached_after_download(
env.pageserver.stop()
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = (
Path(env.pageserver.workdir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
)
local_layer_deleted = False
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):
@@ -257,7 +259,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
env.endpoints.stop_all()
env.pageserver.stop()
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
local_layer_truncated = None
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):

View File

@@ -32,9 +32,7 @@ def test_threshold_based_eviction(
synthetic_size_calculation_interval="2s"
metric_collection_endpoint="http://{host}:{port}/nonexistent"
"""
metrics_refused_log_line = (
".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*"
)
metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(metrics_refused_log_line)

View File

@@ -3,6 +3,7 @@ import os
import queue
import shutil
import threading
from pathlib import Path
import pytest
import requests
@@ -71,7 +72,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
"test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(parent_timeline_id)
)
with pytest.raises(
PageserverApiException, match="Cannot delete timeline which has child timelines"
@@ -82,7 +89,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
assert exc.value.status_code == 412
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
assert timeline_path.exists()
# retry deletes when compaction or gc is running in pageserver
@@ -323,7 +336,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
),
)
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
# Check no delete mark present
@@ -403,7 +416,7 @@ def test_timeline_resurrection_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -454,7 +467,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
"test_timeline_delete_fail_before_local_delete",
)
leaf_timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
leaf_timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
timeline_info = wait_until_timeline_state(
@@ -902,7 +921,7 @@ def test_timeline_delete_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -914,7 +933,7 @@ def test_timeline_delete_resumed_on_attach(
# delete should be resumed
wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
tenant_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -518,7 +518,7 @@ def test_timeline_size_metrics(
).value
# assert that the physical size metric matches the actual physical size on disk
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, new_timeline_id)
timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
# Check that the logical size metric is sane, and matches
@@ -658,7 +658,7 @@ def get_physical_size_values(
)
res.api_current_physical = detail["current_physical_size"]
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
return res

View File

@@ -19,40 +19,18 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
# Create a test table for a few different scenarios and freeze it to set the VM bits.
# Create a test table and freeze it to set the VM bit.
cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_delete VALUES (1)")
cur.execute("VACUUM FREEZE vmtest_delete")
cur.execute("CREATE TABLE vmtest_hot_update (id integer PRIMARY KEY, filler text)")
cur.execute("INSERT INTO vmtest_hot_update VALUES (1, 'x')")
cur.execute("VACUUM FREEZE vmtest_hot_update")
cur.execute("CREATE TABLE vmtest_cold_update (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_cold_update SELECT g FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_cold_update")
cur.execute(
"CREATE TABLE vmtest_cold_update2 (id integer PRIMARY KEY, filler text) WITH (fillfactor=100)"
)
cur.execute("INSERT INTO vmtest_cold_update2 SELECT g, '' FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_cold_update2")
cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_update")
# DELETE and UPDATE the rows.
cur.execute("DELETE FROM vmtest_delete WHERE id = 1")
cur.execute("UPDATE vmtest_hot_update SET filler='x' WHERE id = 1")
cur.execute("UPDATE vmtest_cold_update SET id = 5000 WHERE id = 1")
# Clear the VM bit on the last page with an INSERT. Then clear the VM bit on
# the page where row 1 is (block 0), by doing an UPDATE. The UPDATE is a
# cold update, and the new tuple goes to the last page, which already had
# its VM bit cleared. The point is that the UPDATE *only* clears the VM bit
# on the page containing the old tuple. We had a bug where we got the old
# and new pages mixed up, and that only shows up when one of the bits is
# cleared, but not the other one.
cur.execute("INSERT INTO vmtest_cold_update2 VALUES (9999, 'x')")
# Clears the VM bit on the old page
cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1")
cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1")
# Branch at this point, to test that later
fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
@@ -72,13 +50,9 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
"""
)
cur.execute("SELECT id FROM vmtest_delete WHERE id = 1")
cur.execute("SELECT * FROM vmtest_delete WHERE id = 1")
assert cur.fetchall() == []
cur.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
assert cur.fetchall() == [(1,)]
cur.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
assert cur.fetchall() == []
cur.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
cur.execute("SELECT * FROM vmtest_update WHERE id = 1")
assert cur.fetchall() == []
cur.close()
@@ -103,111 +77,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
"""
)
cur_new.execute("SELECT id FROM vmtest_delete WHERE id = 1")
cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1")
assert cur_new.fetchall() == []
cur_new.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
assert cur_new.fetchall() == [(1,)]
cur_new.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1")
assert cur_new.fetchall() == []
cur_new.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
assert cur_new.fetchall() == []
#
# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
# record.
#
def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
endpoint = env.endpoints.create_start(
"test_vm_bit_clear_on_heap_lock",
config_lines=[
"log_autovacuum_min_duration = 0",
# Perform anti-wraparound vacuuming aggressively
"autovacuum_naptime='1 s'",
"autovacuum_freeze_max_age = 1000000",
],
)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
cur.execute("SELECT pg_switch_wal()")
# Create a test table and freeze it to set the all-frozen VM bit on all pages.
cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
cur.execute("VACUUM FREEZE vmtest_lock")
# Lock a row. This clears the all-frozen VM bit for that page.
cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
# Remember the XID. We will use it later to verify that we have consumed a lot of
# XIDs after this.
cur.execute("select pg_current_xact_id()")
locking_xid = cur.fetchall()[0][0]
# Stop and restart postgres, to clear the buffer cache.
#
# NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
# in a "clean" way. Our neon extension will write a full-page image of the VM
# page, and we want to avoid that.
endpoint.stop()
endpoint.start()
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
tup = cur.fetchall()
xmax_before = tup[0][1]
# Consume a lot of XIDs, so that anti-wraparound autovacuum kicks
# in and the clog gets truncated. We set autovacuum_freeze_max_age to a very
# low value, so it doesn't take all that many XIDs for autovacuum to kick in.
for i in range(1000):
cur.execute(
"""
CREATE TEMP TABLE othertable (i int) ON COMMIT DROP;
do $$
begin
for i in 1..100000 loop
-- Use a begin-exception block to generate a new subtransaction on each iteration
begin
insert into othertable values (i);
exception when others then
raise 'not expected %', sqlerrm;
end;
end loop;
end;
$$;
"""
)
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
tup = cur.fetchall()
log.info(f"tuple = {tup}")
xmax = tup[0][1]
assert xmax == xmax_before
if i % 50 == 0:
cur.execute("select datfrozenxid from pg_database where datname='postgres'")
datfrozenxid = cur.fetchall()[0][0]
if datfrozenxid > locking_xid:
break
cur.execute("select pg_current_xact_id()")
curr_xid = cur.fetchall()[0][0]
assert int(curr_xid) - int(locking_xid) >= 100000
# Now, if the VM all-frozen bit was not correctly cleared on
# replay, we will try to fetch the status of the XID that was
# already truncated away.
#
# ERROR: could not access status of transaction 1027
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
tup = cur.fetchall()
log.info(f"tuple = {tup}")

View File

@@ -14,8 +14,6 @@ from pathlib import Path
from typing import Any, List, Optional
import psycopg2
import psycopg2.errors
import psycopg2.extras
import pytest
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
@@ -262,7 +260,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder):
else:
failed_node.start()
failed_node = None
assert query_scalar(cur, "SELECT sum(key) FROM t") == (n_inserts * (n_inserts + 1)) // 2
assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500
# Test that safekeepers push their info to the broker and learn peer status from it

View File

@@ -43,7 +43,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
tenant_id, _ = env.neon_cli.create_tenant()
# assert tenant exists on disk
assert (env.pageserver.tenant_dir(tenant_id)).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -101,7 +101,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
pytest.fail(f"could not detach tenant: {last_error}")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# Pageserver schedules kill+wait of the WAL redo process to the background runtime,
# asynchronously to tenant detach. Cut it some slack to complete kill+wait before