Compare commits

...

8 Commits

Author SHA1 Message Date
Arpad Müller
d38bc02bdf Use killpg and pgroups to recursively kill everything 2024-01-23 02:01:02 +01:00
Arpad Müller
24bc6ddec4 Just add this wait to prevent a race 2024-01-23 01:58:57 +01:00
Arpad Müller
f49fe734d1 Allow this for a test 2024-01-23 00:28:22 +01:00
Arpad Müller
872e645f7d Disable cancellation support in initdb 2024-01-22 23:47:08 +01:00
Arpad Müller
648fe7c92d Add it to the allowed errors 2024-01-22 18:26:47 +01:00
Arpad Müller
21045477a3 Allow this msg 2024-01-22 15:20:18 +01:00
Arpad Müller
125f24ca49 exit initdb via kill and then await it 2024-01-22 15:19:13 +01:00
Arpad Müller
443d4ce868 Duplicate the test to try to reproduce the issue 2024-01-22 15:19:13 +01:00
2 changed files with 248 additions and 5 deletions

View File

@@ -18,13 +18,16 @@ use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use nix::unistd::Pid;
use pageserver_api::models;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::Signal;
use std::fmt;
use std::os::unix::process::CommandExt;
use storage_broker::BrokerClientChannel;
use tokio::io::BufReader;
use tokio::runtime::Handle;
@@ -3746,7 +3749,17 @@ async fn run_initdb(
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
let mut initdb_command_std = std::process::Command::new(&initdb_bin_path);
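// tokio's own `Command::process_group` is still unstable because tokio's MSRV
// is 1.63, while std's `process_group` was only stabilized in Rust 1.64.
// Building a std `Command` first and converting it into a tokio `Command` is
// the officially recommended workaround.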
// Setting pgroup to 0 makes the pgroupid be that of the child, as explained in
// https://github.com/microsoft/WSL/issues/2997 (unrelated bug, but explains it)
// We need the pgid to be set so that killpg during cancellation also
// terminates the child processes of initdb.
initdb_command_std.process_group(0);
let mut initdb_command = tokio::process::Command::from(initdb_command_std)
.args(["-D", initdb_target_dir.as_ref()])
.args(["-U", &conf.superuser])
.args(["-E", "utf8"])
@@ -3767,13 +3780,25 @@ async fn run_initdb(
.spawn()?;
tokio::select! {
initdb_output = initdb_command.wait_with_output() => {
let initdb_output = initdb_output?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(initdb_output.status, initdb_output.stderr));
exit_status = initdb_command.wait() => {
let exit_status = exit_status?;
if !exit_status.success() {
let mut stderr = initdb_command.stderr.take().unwrap();
let mut stderr_vec = Vec::new();
tokio::io::copy(&mut stderr, &mut stderr_vec).await?;
return Err(InitdbError::Failed(exit_status, stderr_vec));
}
}
_ = cancel.cancelled() => {
if let Some(pid) = initdb_command.id() {
warn!("Doing killpg...");
nix::sys::signal::killpg(Pid::from_raw(pid as i32), Signal::SIGKILL)
.map_err(|e| InitdbError::Other(anyhow::anyhow!(e)))?;
initdb_command.wait().await?;
} else {
warn!("Couldn't obtain initdb pid, killing initdb process only.");
initdb_command.kill().await?;
}
return Err(InitdbError::Cancelled);
}
}

View File

@@ -556,6 +556,216 @@ def test_tenant_delete_concurrent(
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 0
def test_tenant_delete_races_timeline_creation_01(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 01 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_02(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 02 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_03(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 03 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_04(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 04 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_05(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 05 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_06(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 06 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_07(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 07 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_08(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 08 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_09(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 09 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_10(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 10 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_11(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 11 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_12(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 12 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_13(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 13 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_14(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 14 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_15(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 15 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_16(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 16 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_17(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 17 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_18(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 18 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_19(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 19 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_20(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 20 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_21(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 21 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_22(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 22 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_23(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 23 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_24(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 24 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_25(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 25 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_26(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 26 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_27(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 27 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_28(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 28 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_29(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 29 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation_30(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    """
    Repetition 30 of `test_tenant_delete_races_timeline_creation`.

    Forwards both fixtures unchanged to the base test; this copy only adds
    another run of the same scenario.
    """
    test_tenant_delete_races_timeline_creation(neon_env_builder=neon_env_builder, pg_bin=pg_bin)
def test_tenant_delete_races_timeline_creation(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
@@ -578,6 +788,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*Cancelled request finished with an error: InternalServerError\\(.*ancelled"
)
# Occasionally a request arrives while the state is already Stopping, which produces this error.
CONFLICT_MESSAGE = ".*Precondition failed: Invalid state Stopping. Expected Active or Broken.*"
env.pageserver.allowed_errors.extend(
[
# lucky race with stopping from flushing a layer we fail to schedule any uploads
@@ -586,6 +799,9 @@ def test_tenant_delete_races_timeline_creation(
".*POST.*/timeline.* request was dropped before completing",
# Timeline creation runs into this error
CANCELLED_ERROR,
# Timeline deletion can run into this error
CONFLICT_MESSAGE,
".*tenant_delete_handler.*still waiting, taking longer than expected.*",
]
)
@@ -643,6 +859,8 @@ def test_tenant_delete_races_timeline_creation(
except PageserverApiException:
pass
os.wait(4)
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,