From 45b71fececc73ba8ce80a1fa093c47bf3dcf06eb Mon Sep 17 00:00:00 2001
From: Bojan Serafimov
Date: Mon, 12 Jun 2023 18:25:11 -0400
Subject: [PATCH] Add size metric and test

---
 compute_tools/src/compute.rs            | 48 ++++++++++++++++++++++++-
 libs/compute_api/src/responses.rs       |  1 +
 test_runner/performance/test_startup.py | 20 +++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 977708a18f..c98b267beb 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,5 @@
 use std::fs;
+use std::io::Read;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -133,6 +134,50 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
     }
 }
 
+/// Wrapper for a reader that counts bytes read and reports them as a metric.
+///
+/// HACK The interface of this struct is a little funny, mostly because we want
+/// to use it as input for tar::Archive::new(reader), which for some reason
+/// takes ownership of the reader instead of just &mut. Since ownership moves
+/// into the archive, we can't read the byte count back out of the wrapper.
+/// Instead we keep a &ComputeNode in the struct and update metrics on Drop.
+struct ByteCounter<'a, R: Read> {
+    inner: R,
+    byte_count: usize,
+    compute_node: &'a ComputeNode,
+}
+
+impl<'a, R: Read> ByteCounter<'a, R> {
+    fn new(reader: R, compute_node: &'a ComputeNode) -> Self {
+        Self {
+            inner: reader,
+            byte_count: 0,
+            compute_node,
+        }
+    }
+}
+
+impl<R: Read> Read for ByteCounter<'_, R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let result = self.inner.read(buf);
+        if let Ok(n_bytes) = result {
+            self.byte_count += n_bytes;
+        }
+        result
+    }
+}
+
+impl<R: Read> Drop for ByteCounter<'_, R> {
+    fn drop(&mut self) {
+        self.compute_node
+            .state
+            .lock()
+            .unwrap()
+            .metrics
+            .basebackup_bytes = self.byte_count as u64;
+    }
+}
+
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
         let mut state = self.state.lock().unwrap();
@@ -179,13 +224,14 @@ impl ComputeNode {
             _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
         };
         let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+        let read_counter = ByteCounter::new(copyreader, self);
 
         // Read the archive directly from the `CopyOutReader`
         //
         // Set `ignore_zeros` so that unpack() reads all the Copy data and
         // doesn't stop at the end-of-archive marker. Otherwise, if the server
         // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
+        let mut ar = tar::Archive::new(read_counter);
         ar.set_ignore_zeros(true);
         ar.unpack(&self.pgdata)?;
 
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index ce73dda08a..ce982863a5 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
     pub wait_for_spec_ms: u64,
     pub sync_safekeepers_ms: u64,
     pub basebackup_ms: u64,
+    pub basebackup_bytes: u64,
     pub config_ms: u64,
     pub total_startup_ms: u64,
 }
diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py
index 9c45088d62..c37dc7e069 100644
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -6,6 +6,26 @@
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
 
+@pytest.mark.xfail  # We currently pass a 16MB pg_wal dir instead of creating it client-side
+def test_basebackup_size(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    # Start
+    env.neon_cli.create_branch("test_startup")
+    endpoint = env.endpoints.create_start("test_startup")
+
+    # Get metrics
+    metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+    basebackup_bytes = metrics["basebackup_bytes"]
+    zenbenchmark.record(
+        "basebackup_size", basebackup_bytes, "bytes", report=MetricReport.LOWER_IS_BETTER
+    )
+
+    # Seems like a reasonable limit, but increase it if it becomes impossible to meet
+    assert basebackup_bytes < 70 * 1024
+
+
 # Just start and measure duration.
 #
 # This test runs pretty quickly and can be informative when used in combination
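
For context, here is a minimal, self-contained sketch of the Drop-based counting pattern that ByteCounter uses above. The CountingReader name and the Arc<Mutex<u64>> cell standing in for ComputeNode's metrics state are illustrative assumptions, not part of this patch:

use std::io::Read;
use std::sync::{Arc, Mutex};

/// Counts bytes flowing through the inner reader and publishes the total
/// to a shared cell when dropped, mirroring the patch's Drop-based reporting.
struct CountingReader<R: Read> {
    inner: R,
    byte_count: usize,
    total: Arc<Mutex<u64>>, // stand-in for ComputeNode's metrics state
}

impl<R: Read> Read for CountingReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let result = self.inner.read(buf);
        if let Ok(n) = result {
            self.byte_count += n;
        }
        result
    }
}

impl<R: Read> Drop for CountingReader<R> {
    fn drop(&mut self) {
        // Runs even though the consumer (e.g. tar::Archive) took ownership of us.
        *self.total.lock().unwrap() = self.byte_count as u64;
    }
}

fn main() -> std::io::Result<()> {
    let total = Arc::new(Mutex::new(0u64));
    {
        let mut reader = CountingReader {
            inner: &b"hello basebackup"[..], // any Read works, e.g. a CopyOutReader
            byte_count: 0,
            total: Arc::clone(&total),
        };
        // Drain the reader, as tar::Archive::unpack would.
        std::io::copy(&mut reader, &mut std::io::sink())?;
    } // reader dropped here; the total becomes observable
    assert_eq!(*total.lock().unwrap(), 16);
    Ok(())
}

Handing the wrapper shared state (or a borrowed &ComputeNode, as the patch does) is what makes the count observable after tar::Archive::new consumes the reader: Drop publishes the metric exactly when the archive releases it.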