Add size metric and test

This commit is contained in:
Bojan Serafimov
2023-06-12 18:25:11 -04:00
parent 4936ab6842
commit 45b71fecec
3 changed files with 68 additions and 1 deletion

View File

@@ -1,4 +1,5 @@
 use std::fs;
+use std::io::Read;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -133,6 +134,50 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
     }
 }
 
+/// Wrapper for a reader that counts bytes and reports metrics.
+///
+/// HACK The interface of this struct is a little funny, mostly because we want
+/// to use it as input for tar::Archive::new(reader), which for some reason
+/// takes ownership of the reader instead of just &mut. So we can't access
+/// the reader to read the byte count because we lose ownership. Instead we
+/// pass the ComputeNode inside the struct and update metrics on Drop.
+struct ByteCounter<'a, R: Read> {
+    inner: R,
+    byte_count: usize,
+    compute_node: &'a ComputeNode,
+}
+
+impl<'a, R: Read> ByteCounter<'a, R> {
+    fn new(reader: R, compute_node: &'a ComputeNode) -> Self {
+        Self {
+            inner: reader,
+            byte_count: 0,
+            compute_node,
+        }
+    }
+}
+
+impl<R: Read> Read for ByteCounter<'_, R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let result = self.inner.read(buf);
+        if let Ok(n_bytes) = result {
+            self.byte_count += n_bytes;
+        }
+        result
+    }
+}
+
+impl<R: Read> Drop for ByteCounter<'_, R> {
+    fn drop(&mut self) {
+        self.compute_node
+            .state
+            .lock()
+            .unwrap()
+            .metrics
+            .basebackup_bytes = self.byte_count as u64;
+    }
+}
+
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
         let mut state = self.state.lock().unwrap();
@@ -179,13 +224,14 @@ impl ComputeNode {
             _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
         };
 
         let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+        let read_counter = ByteCounter::new(copyreader, self);
 
         // Read the archive directly from the `CopyOutReader`
         //
         // Set `ignore_zeros` so that unpack() reads all the Copy data and
         // doesn't stop at the end-of-archive marker. Otherwise, if the server
         // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
+        let mut ar = tar::Archive::new(read_counter);
         ar.set_ignore_zeros(true);
         ar.unpack(&self.pgdata)?;
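
The Drop-based reporting above is a general workaround for APIs that consume a reader by value, as tar::Archive::new does: stash a shared slot outside the struct and publish the count when the consumer drops the reader. A minimal standalone sketch of the same pattern, with an Arc<AtomicU64> standing in for the ComputeNode reference (names here are illustrative, not from this commit):

use std::io::Read;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

/// Counts bytes flowing through an inner reader; publishes the total on Drop.
struct CountingReader<R: Read> {
    inner: R,
    bytes: u64,
    total: Arc<AtomicU64>, // shared slot the caller keeps after giving the reader away
}

impl<R: Read> Read for CountingReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let result = self.inner.read(buf);
        if let Ok(n) = result {
            self.bytes += n as u64;
        }
        result
    }
}

impl<R: Read> Drop for CountingReader<R> {
    fn drop(&mut self) {
        // Runs when the consumer (tar::Archive in the commit) drops the reader.
        self.total.store(self.bytes, Ordering::Relaxed);
    }
}

fn main() -> std::io::Result<()> {
    let total = Arc::new(AtomicU64::new(0));
    let mut reader = CountingReader {
        inner: &b"hello world"[..],
        bytes: 0,
        total: Arc::clone(&total),
    };
    let mut sink = Vec::new();
    reader.read_to_end(&mut sink)?; // stands in for tar::Archive consuming the reader
    drop(reader); // the count is published here, on Drop
    assert_eq!(total.load(Ordering::Relaxed), 11);
    Ok(())
}

One caveat the sketch shares with the commit's version: the total is only published on Drop, so if unpacking fails partway through, the metric reflects however many bytes were read before the error.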

View File

@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
     pub wait_for_spec_ms: u64,
     pub sync_safekeepers_ms: u64,
     pub basebackup_ms: u64,
+    pub basebackup_bytes: u64,
     pub config_ms: u64,
     pub total_startup_ms: u64,
 }
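
The Python test in the next file reads this value by key from the compute's /metrics.json endpoint. A hedged sketch of how the new field could surface there, assuming the endpoint serializes ComputeMetrics via serde with default field naming (the HTTP handler and derives are not shown in this diff, so the Serialize derive is an assumption):

use serde::Serialize;

// Hypothetical mirror of the struct above, with an assumed Serialize derive.
#[derive(Serialize, Default)]
pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
    pub basebackup_ms: u64,
    pub basebackup_bytes: u64,
    pub config_ms: u64,
    pub total_startup_ms: u64,
}

fn main() {
    let metrics = ComputeMetrics {
        basebackup_bytes: 65_536,
        ..Default::default()
    };
    // Prints {"wait_for_spec_ms":0,...,"basebackup_bytes":65536,...},
    // i.e. the key the test reads as metrics["basebackup_bytes"].
    println!("{}", serde_json::to_string(&metrics).unwrap());
}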

View File

@@ -6,6 +6,26 @@ from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
 
 
+@pytest.mark.xfail  # We currently pass a 16MB pg_wal dir instead of creating it client-side
+def test_basebackup_size(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    # Start
+    env.neon_cli.create_branch("test_startup")
+    endpoint = env.endpoints.create_start("test_startup")
+
+    # Get metrics
+    metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+    basebackup_bytes = metrics["basebackup_bytes"]
+    zenbenchmark.record(
+        "basebackup_size", basebackup_bytes, "bytes", report=MetricReport.LOWER_IS_BETTER
+    )
+
+    # Seems like a reasonable limit, but increase it if it becomes impossible to meet
+    assert basebackup_bytes < 70 * 1024
+
+
 # Just start and measure duration.
 #
 # This test runs pretty quickly and can be informative when used in combination