test: avoid some too long shutdowns by flushing before shutdown (#8772)

After #8655, we needed to mark some tests to shut down immediately. To
aid these tests, try the new pattern of `flush_ep_to_pageserver`
followed by a non-compacting checkpoint. This moves the general graceful
shutdown problem of having too much to flush at shutdown into the test.
Also, add logging for how long the graceful shutdown took, if we got to
complete it for faster log eyeballing.

Fixes: #8712
Cc: #8715, #8708
This commit is contained in:
Joonas Koivunen
2024-08-21 21:26:27 +03:00
committed by GitHub
parent 04752dfa75
commit 07b7c63975
4 changed files with 45 additions and 22 deletions

View File

@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
) {
use std::time::Duration;
let started_at = std::time::Instant::now();
// If the orderly shutdown below takes too long, we still want to make
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
//
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
walredo_extraordinary_shutdown_thread.join().unwrap();
info!("walredo_extraordinary_shutdown_thread done");
info!("Shut down successfully completed");
info!(
elapsed_ms = started_at.elapsed().as_millis(),
"Shut down successfully completed"
);
std::process::exit(exit_code);
}

View File

@@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
flush_ep_to_pageserver,
)
from fixtures.remote_storage import s3_storage
from fixtures.utils import humantime_to_ms
@@ -62,9 +66,6 @@ def test_download_churn(
run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
# see https://github.com/neondatabase/neon/issues/8712
env.stop(immediate=True)
def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
remote_storage_kind = s3_storage()
@@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i",
options="-c statement_timeout=0",
)
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
# TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
wait_for_upload_queue_empty(client, tenant_id, timeline_id)
flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True)
return env

View File

@@ -1,20 +1,21 @@
import time
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
#
# Benchmark searching the layer map, when there are a lot of small layer files.
#
def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
env = neon_env_builder.init_start()
"""Benchmark searching the layer map, when there are a lot of small layer files."""
env = neon_env_builder.init_configs()
n_iters = 10
n_records = 100000
env.start()
# We want to have a lot of lot of layer files to exercise the layer map. Disable
# GC, and make checkpoint_distance very small, so that we get a lot of small layer
# files.
tenant, _ = env.neon_cli.create_tenant(
tenant, timeline = env.neon_cli.create_tenant(
conf={
"gc_period": "0s",
"checkpoint_distance": "16384",
@@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
}
)
env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
endpoint = env.endpoints.create_start("main", tenant_id=tenant)
cur = endpoint.connect().cursor()
cur.execute("create table t(x integer)")
for _ in range(n_iters):
@@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
time.sleep(1)
cur.execute("vacuum t")
with zenbenchmark.record_duration("test_query"):
cur.execute("SELECT count(*) from t")
assert cur.fetchone() == (n_iters * n_records,)
# see https://github.com/neondatabase/neon/issues/8712
env.stop(immediate=True)
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
env.pageserver.http_client().timeline_checkpoint(
tenant, timeline, compact=False, wait_until_uploaded=True
)

View File

@@ -1,4 +1,4 @@
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
@@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
cur.execute("SELECT clear_buffer_cache()")
endpoint.clear_shared_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.
@@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
assert len(rows) == 500
cur.execute("rollback")
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
env.pageserver.http_client().timeline_checkpoint(
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
)
def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
@@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
cur.execute("delete from t")
# Clear the cache, so that we exercise reconstructing the pages
# from WAL
cur.execute("SELECT clear_buffer_cache()")
endpoint.clear_shared_buffers()
# Check that the cursor opened earlier still works. If the
# combocids are not restored correctly, it won't.
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
cur.execute("rollback")
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
env.pageserver.http_client().timeline_checkpoint(
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
)
def test_combocid(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
@@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder):
assert cur.rowcount == n_records
cur.execute("rollback")
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
env.pageserver.http_client().timeline_checkpoint(
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
)