From 07b7c63975fbfaf60f28176b275c4d57e28a8e04 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 21 Aug 2024 21:26:27 +0300
Subject: [PATCH] test: avoid some too long shutdowns by flushing before
 shutdown (#8772)

After #8655, we needed to mark some tests to shut down immediately. To
aid these tests, try the new pattern of `flush_ep_to_pageserver`
followed by a non-compacting checkpoint. This moves the general graceful
shutdown problem of having too much to flush at shutdown into the test.
Also, add logging for how long the graceful shutdown took, if we got to
complete it for faster log eyeballing.

Fixes: #8712
Cc: #8715, #8708
---
 pageserver/src/lib.rs                         |  7 +++++-
 .../pagebench/test_ondemand_download_churn.py | 17 +++++++-------
 test_runner/performance/test_layer_map.py     | 23 +++++++++++--------
 test_runner/regress/test_combocid.py          | 20 +++++++++++++---
 4 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 5829a1c188..dbfc9f3544 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
 ) {
     use std::time::Duration;
 
+    let started_at = std::time::Instant::now();
+
     // If the orderly shutdown below takes too long, we still want to make
     // sure that all walredo processes are killed and wait()ed on by us, not systemd.
     //
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
     walredo_extraordinary_shutdown_thread.join().unwrap();
     info!("walredo_extraordinary_shutdown_thread done");
 
-    info!("Shut down successfully completed");
+    info!(
+        elapsed_ms = started_at.elapsed().as_millis(),
+        "Shut down successfully completed"
+    );
     std::process::exit(exit_code);
 }
 
diff --git a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
index 0348b08f04..9ad6e7907c 100644
--- a/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
+++ b/test_runner/performance/pageserver/pagebench/test_ondemand_download_churn.py
@@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
-from fixtures.pageserver.utils import wait_for_upload_queue_empty
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    flush_ep_to_pageserver,
+)
 from fixtures.remote_storage import s3_storage
 from fixtures.utils import humantime_to_ms
 
@@ -62,9 +66,6 @@ def test_download_churn(
 
     run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
-
 
 def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     remote_storage_kind = s3_storage()
@@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)})  as i",
             options="-c statement_timeout=0",
         )
-        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
-    # TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
-    wait_for_upload_queue_empty(client, tenant_id, timeline_id)
+        flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
+
+    client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True)
 
     return env
 
diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py
index 890b70b9fc..bc6d9de346 100644
--- a/test_runner/performance/test_layer_map.py
+++ b/test_runner/performance/test_layer_map.py
@@ -1,20 +1,21 @@
 import time
 
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
-#
-# Benchmark searching the layer map, when there are a lot of small layer files.
-#
 def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
-    env = neon_env_builder.init_start()
+    """Benchmark searching the layer map, when there are a lot of small layer files."""
+
+    env = neon_env_builder.init_configs()
     n_iters = 10
     n_records = 100000
 
+    env.start()
+
     # We want to have a lot of lot of layer files to exercise the layer map. Disable
     # GC, and make checkpoint_distance very small, so that we get a lot of small layer
     # files.
-    tenant, _ = env.neon_cli.create_tenant(
+    tenant, timeline = env.neon_cli.create_tenant(
         conf={
             "gc_period": "0s",
             "checkpoint_distance": "16384",
@@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         }
     )
 
-    env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
-    endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
     cur = endpoint.connect().cursor()
     cur.execute("create table t(x integer)")
     for _ in range(n_iters):
@@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
         time.sleep(1)
 
     cur.execute("vacuum t")
+
     with zenbenchmark.record_duration("test_query"):
         cur.execute("SELECT count(*) from t")
         assert cur.fetchone() == (n_iters * n_records,)
 
-    # see https://github.com/neondatabase/neon/issues/8712
-    env.stop(immediate=True)
+    flush_ep_to_pageserver(env, endpoint, tenant, timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        tenant, timeline, compact=False, wait_until_uploaded=True
+    )
diff --git a/test_runner/regress/test_combocid.py b/test_runner/regress/test_combocid.py
index 6d2567b7ee..41907b1f20 100644
--- a/test_runner/regress/test_combocid.py
+++ b/test_runner/regress/test_combocid.py
@@ -1,4 +1,4 @@
-from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
 
 
 def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
@@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
 
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
     assert len(rows) == 500
 
     cur.execute("rollback")
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
 
 
 def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
@@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
     cur.execute("delete from t")
     # Clear the cache, so that we exercise reconstructing the pages
     # from WAL
-    cur.execute("SELECT clear_buffer_cache()")
+    endpoint.clear_shared_buffers()
 
     # Check that the cursor opened earlier still works. If the
     # combocids are not restored correctly, it won't.
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
 
     cur.execute("rollback")
 
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )
+
 
 def test_combocid(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
@@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder):
     assert cur.rowcount == n_records
 
     cur.execute("rollback")
+
+    flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
+    env.pageserver.http_client().timeline_checkpoint(
+        env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
+    )