mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-01 04:20:39 +00:00
test: avoid some too long shutdowns by flushing before shutdown (#8772)
After #8655, we needed to mark some tests to shut down immediately. To aid these tests, try the new pattern of `flush_ep_to_pageserver` followed by a non-compacting checkpoint. This moves the general graceful shutdown problem of having too much to flush at shutdown into the test. Also, add logging for how long the graceful shutdown took, if we got to complete it for faster log eyeballing. Fixes: #8712 Cc: #8715, #8708
This commit is contained in:
@@ -88,6 +88,8 @@ pub async fn shutdown_pageserver(
|
||||
) {
|
||||
use std::time::Duration;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// If the orderly shutdown below takes too long, we still want to make
|
||||
// sure that all walredo processes are killed and wait()ed on by us, not systemd.
|
||||
//
|
||||
@@ -241,7 +243,10 @@ pub async fn shutdown_pageserver(
|
||||
walredo_extraordinary_shutdown_thread.join().unwrap();
|
||||
info!("walredo_extraordinary_shutdown_thread done");
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
info!(
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"Shut down successfully completed"
|
||||
);
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,8 +5,12 @@ from typing import Any, Dict, Tuple
|
||||
import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
|
||||
from fixtures.pageserver.utils import wait_for_upload_queue_empty
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
flush_ep_to_pageserver,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
from fixtures.utils import humantime_to_ms
|
||||
|
||||
@@ -62,9 +66,6 @@ def test_download_churn(
|
||||
|
||||
run_benchmark(env, pg_bin, record, io_engine, concurrency_per_target, duration)
|
||||
|
||||
# see https://github.com/neondatabase/neon/issues/8712
|
||||
env.stop(immediate=True)
|
||||
|
||||
|
||||
def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
remote_storage_kind = s3_storage()
|
||||
@@ -98,9 +99,9 @@ def setup_env(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
f"INSERT INTO data SELECT lpad(i::text, {bytes_per_row}, '0') FROM generate_series(1, {int(nrows)}) as i",
|
||||
options="-c statement_timeout=0",
|
||||
)
|
||||
wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
|
||||
# TODO: this is a bit imprecise, there could be frozen layers being written out that we don't observe here
|
||||
wait_for_upload_queue_empty(client, tenant_id, timeline_id)
|
||||
flush_ep_to_pageserver(env, ep, tenant_id, timeline_id)
|
||||
|
||||
client.timeline_checkpoint(tenant_id, timeline_id, compact=False, wait_until_uploaded=True)
|
||||
|
||||
return env
|
||||
|
||||
|
||||
@@ -1,20 +1,21 @@
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
|
||||
|
||||
|
||||
#
|
||||
# Benchmark searching the layer map, when there are a lot of small layer files.
|
||||
#
|
||||
def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
|
||||
env = neon_env_builder.init_start()
|
||||
"""Benchmark searching the layer map, when there are a lot of small layer files."""
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
n_iters = 10
|
||||
n_records = 100000
|
||||
|
||||
env.start()
|
||||
|
||||
# We want to have a lot of lot of layer files to exercise the layer map. Disable
|
||||
# GC, and make checkpoint_distance very small, so that we get a lot of small layer
|
||||
# files.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
tenant, timeline = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "0s",
|
||||
"checkpoint_distance": "16384",
|
||||
@@ -24,8 +25,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
|
||||
}
|
||||
)
|
||||
|
||||
env.neon_cli.create_timeline("test_layer_map", tenant_id=tenant)
|
||||
endpoint = env.endpoints.create_start("test_layer_map", tenant_id=tenant)
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant)
|
||||
cur = endpoint.connect().cursor()
|
||||
cur.execute("create table t(x integer)")
|
||||
for _ in range(n_iters):
|
||||
@@ -33,9 +33,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
|
||||
time.sleep(1)
|
||||
|
||||
cur.execute("vacuum t")
|
||||
|
||||
with zenbenchmark.record_duration("test_query"):
|
||||
cur.execute("SELECT count(*) from t")
|
||||
assert cur.fetchone() == (n_iters * n_records,)
|
||||
|
||||
# see https://github.com/neondatabase/neon/issues/8712
|
||||
env.stop(immediate=True)
|
||||
flush_ep_to_pageserver(env, endpoint, tenant, timeline)
|
||||
env.pageserver.http_client().timeline_checkpoint(
|
||||
tenant, timeline, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, flush_ep_to_pageserver
|
||||
|
||||
|
||||
def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
|
||||
@@ -34,7 +34,7 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
|
||||
|
||||
# Clear the cache, so that we exercise reconstructing the pages
|
||||
# from WAL
|
||||
cur.execute("SELECT clear_buffer_cache()")
|
||||
endpoint.clear_shared_buffers()
|
||||
|
||||
# Check that the cursor opened earlier still works. If the
|
||||
# combocids are not restored correctly, it won't.
|
||||
@@ -43,6 +43,10 @@ def do_combocid_op(neon_env_builder: NeonEnvBuilder, op):
|
||||
assert len(rows) == 500
|
||||
|
||||
cur.execute("rollback")
|
||||
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
env.pageserver.http_client().timeline_checkpoint(
|
||||
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
|
||||
def test_combocid_delete(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -92,7 +96,7 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
|
||||
cur.execute("delete from t")
|
||||
# Clear the cache, so that we exercise reconstructing the pages
|
||||
# from WAL
|
||||
cur.execute("SELECT clear_buffer_cache()")
|
||||
endpoint.clear_shared_buffers()
|
||||
|
||||
# Check that the cursor opened earlier still works. If the
|
||||
# combocids are not restored correctly, it won't.
|
||||
@@ -102,6 +106,11 @@ def test_combocid_multi_insert(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
cur.execute("rollback")
|
||||
|
||||
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
env.pageserver.http_client().timeline_checkpoint(
|
||||
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
|
||||
def test_combocid(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -137,3 +146,8 @@ def test_combocid(neon_env_builder: NeonEnvBuilder):
|
||||
assert cur.rowcount == n_records
|
||||
|
||||
cur.execute("rollback")
|
||||
|
||||
flush_ep_to_pageserver(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
env.pageserver.http_client().timeline_checkpoint(
|
||||
env.initial_tenant, env.initial_timeline, compact=False, wait_until_uploaded=True
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user