From 45b71fececc73ba8ce80a1fa093c47bf3dcf06eb Mon Sep 17 00:00:00 2001
From: Bojan Serafimov
Date: Mon, 12 Jun 2023 18:25:11 -0400
Subject: [PATCH] Add size metric and test

---
 compute_tools/src/compute.rs            | 48 ++++++++++++++++++++++++-
 libs/compute_api/src/responses.rs       |  1 +
 test_runner/performance/test_startup.py | 20 +++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 977708a18f..c98b267beb 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,5 @@
 use std::fs;
+use std::io::Read;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -133,6 +134,50 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
     }
 }
 
+/// Wrapper for a reader that counts bytes read and reports them as a metric.
+///
+/// HACK The interface of this struct is a little funny, mostly because we want
+/// to use it as input for tar::Archive::new(reader), which for some reason
+/// takes ownership of the reader instead of just &mut. Since ownership moves
+/// into the archive, we can't read the byte count back out of the wrapper.
+/// Instead we keep a &ComputeNode in the struct and update metrics on Drop.
+struct ByteCounter<'a, R: Read> {
+    inner: R,
+    byte_count: usize,
+    compute_node: &'a ComputeNode,
+}
+
+impl<'a, R: Read> ByteCounter<'a, R> {
+    fn new(reader: R, compute_node: &'a ComputeNode) -> Self {
+        Self {
+            inner: reader,
+            byte_count: 0,
+            compute_node,
+        }
+    }
+}
+
+impl<R: Read> Read for ByteCounter<'_, R> {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        let result = self.inner.read(buf);
+        if let Ok(n_bytes) = result {
+            self.byte_count += n_bytes;
+        }
+        result
+    }
+}
+
+impl<R: Read> Drop for ByteCounter<'_, R> {
+    fn drop(&mut self) {
+        self.compute_node
+            .state
+            .lock()
+            .unwrap()
+            .metrics
+            .basebackup_bytes = self.byte_count as u64;
+    }
+}
+
 impl ComputeNode {
     pub fn set_status(&self, status: ComputeStatus) {
         let mut state = self.state.lock().unwrap();
@@ -179,13 +224,14 @@ impl ComputeNode {
             _ => format!("basebackup {} {} {}", spec.tenant_id, spec.timeline_id, lsn),
         };
         let copyreader = client.copy_out(basebackup_cmd.as_str())?;
+        let read_counter = ByteCounter::new(copyreader, self);
 
         // Read the archive directly from the `CopyOutReader`
         //
         // Set `ignore_zeros` so that unpack() reads all the Copy data and
         // doesn't stop at the end-of-archive marker. Otherwise, if the server
         // sends an Error after finishing the tarball, we will not notice it.
-        let mut ar = tar::Archive::new(copyreader);
+        let mut ar = tar::Archive::new(read_counter);
         ar.set_ignore_zeros(true);
         ar.unpack(&self.pgdata)?;
 
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index ce73dda08a..ce982863a5 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -71,6 +71,7 @@ pub struct ComputeMetrics {
     pub wait_for_spec_ms: u64,
     pub sync_safekeepers_ms: u64,
     pub basebackup_ms: u64,
+    pub basebackup_bytes: u64,
     pub config_ms: u64,
     pub total_startup_ms: u64,
 }
diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py
index 9c45088d62..c37dc7e069 100644
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -6,6 +6,26 @@
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
 
+@pytest.mark.xfail  # We currently pass a 16MB pg_wal dir instead of creating it client-side
+def test_basebackup_size(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    # Start
+    env.neon_cli.create_branch("test_startup")
+    endpoint = env.endpoints.create_start("test_startup")
+
+    # Get metrics
+    metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+    basebackup_bytes = metrics["basebackup_bytes"]
+    zenbenchmark.record(
+        "basebackup_size", basebackup_bytes, "bytes", report=MetricReport.LOWER_IS_BETTER
+    )
+
+    # Seems like a reasonable limit, but increase it if it becomes impossible to meet
+    assert basebackup_bytes < 70 * 1024
+
+
 # Just start and measure duration.
 #
 # This test runs pretty quickly and can be informative when used in combination
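
For context, here is a minimal, self-contained sketch of the Drop-based counting pattern that ByteCounter uses above. The CountingReader name and the Arc<Mutex<u64>> cell standing in for ComputeNode's metrics state are illustrative assumptions, not part of this patch:

use std::io::Read;
use std::sync::{Arc, Mutex};

/// Counts bytes flowing through the inner reader and publishes the total
/// to a shared cell when dropped, mirroring the patch's Drop-based reporting.
struct CountingReader<R: Read> {
    inner: R,
    byte_count: usize,
    total: Arc<Mutex<u64>>, // stand-in for ComputeNode's metrics state
}

impl<R: Read> Read for CountingReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let result = self.inner.read(buf);
        if let Ok(n) = result {
            self.byte_count += n;
        }
        result
    }
}

impl<R: Read> Drop for CountingReader<R> {
    fn drop(&mut self) {
        // Runs even though the consumer (e.g. tar::Archive) took ownership of us.
        *self.total.lock().unwrap() = self.byte_count as u64;
    }
}

fn main() -> std::io::Result<()> {
    let total = Arc::new(Mutex::new(0u64));
    {
        let mut reader = CountingReader {
            inner: &b"hello basebackup"[..], // any Read works, e.g. a CopyOutReader
            byte_count: 0,
            total: Arc::clone(&total),
        };
        // Drain the reader, as tar::Archive::unpack would.
        std::io::copy(&mut reader, &mut std::io::sink())?;
    } // reader dropped here; the total becomes observable
    assert_eq!(*total.lock().unwrap(), 16);
    Ok(())
}

Handing the wrapper shared state (or a borrowed &ComputeNode, as the patch does) is what makes the count observable after tar::Archive::new consumes the reader: Drop publishes the metric exactly when the archive releases it.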