Compare commits

...

18 Commits

Author SHA1 Message Date
Christian Schwarz
2978c839c6 avoid --privileged and blanket passwdless sudo 2024-01-26 13:04:10 +00:00
Christian Schwarz
1fdde9e41e fixup cedc0376ff 2024-01-26 10:26:03 +00:00
Christian Schwarz
3c71003b7d Revert "[DO NOT MERGE] build only debug build for v14"
This reverts commit 800d3d1cee.
2024-01-26 10:18:45 +00:00
Christian Schwarz
a36be87680 Merge remote-tracking branch 'origin/main' into problame/neon-env-builder-cgroup 2024-01-26 10:17:25 +00:00
Christian Schwarz
39490ddd6c fix other detected process leakage in the lowest-effort-way possible 2024-01-26 10:11:08 +00:00
Christian Schwarz
2b581eefa7 add todo to protect against race condition with leaked threads 2024-01-26 10:09:35 +00:00
Christian Schwarz
cedc0376ff fix(neon_local): leaks compute_ctl child process if get_status() fails
Copy-pasting from #6474 here; as multiple TODO comments in this file
indicate, we should really be using background_process::start_process
for compute_ctl => https://github.com/neondatabase/neon/pull/6482
2024-01-26 10:08:33 +00:00
Christian Schwarz
b2ec54b8ac Revert "run tests sequentially"
This reverts commit 407c78cfaf.
2024-01-25 20:31:20 +00:00
Christian Schwarz
193ba2384a Revert "[DO NOT MERGE] fail fast"
This reverts commit 194981c16b.
2024-01-25 20:31:04 +00:00
Christian Schwarz
bcb8bed875 Revert "guarantee leaked processes so we know this actually works"
This reverts commit 4204a7dd59.
2024-01-25 19:53:09 +00:00
Christian Schwarz
23e36ae6a3 see if this fixes the permission denied issue 2024-01-25 19:02:35 +00:00
Christian Schwarz
194981c16b [DO NOT MERGE] fail fast 2024-01-25 19:01:04 +00:00
Christian Schwarz
415b489f18 neon_simple_env was not using test_cgroup_dir 2024-01-25 18:59:41 +00:00
Christian Schwarz
407c78cfaf run tests sequentially 2024-01-25 16:27:10 +00:00
Alexander Bayandin
800d3d1cee [DO NOT MERGE] build only debug build for v14 2024-01-25 16:20:55 +00:00
Christian Schwarz
4204a7dd59 guarantee leaked processes so we know this actually works 2024-01-25 13:05:29 +00:00
Christian Schwarz
b602063f7f attempt to make it work in CI 2024-01-25 13:05:29 +00:00
Christian Schwarz
bcfd333c98 feat(test suite): use cgroups to detect if a test leaks processes
Tested manually by commenting out NeonEnv.stop()'s self.attachment_service.stop() call.
2024-01-25 13:05:29 +00:00
6 changed files with 139 additions and 7 deletions

View File

@@ -443,7 +443,7 @@ jobs:
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --cgroupns=private --security-opt umask=/sys/fs/cgroup
strategy:
fail-fast: false
matrix:
@@ -456,6 +456,9 @@ jobs:
submodules: true
fetch-depth: 1
- name: Setup cgroup for use by test suite
run: sudo bash -x /setup_neon_testsuite_cgroup.bash
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
with:
@@ -472,6 +475,7 @@ jobs:
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
NEON_TEST_SUITE_USE_CGROUPS: /sys/fs/cgroup/neon_testsuite
- name: Merge and upload coverage data
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'

View File

@@ -1,8 +1,5 @@
FROM debian:bullseye-slim
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
# System deps
RUN set -e \
@@ -43,6 +40,7 @@ RUN set -e \
openssh-client \
parallel \
pkg-config \
sudo \
unzip \
wget \
xz-utils \
@@ -50,6 +48,17 @@ RUN set -e \
zstd \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Add nonroot user
RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
RUN echo "#!/usr/bin/env bash \
set -exuo pipefail \
mkdir /sys/fs/cgroup/neon_testsuite \
chown -R nonroot:nonroot /sys/fs/cgroup/neon_testsuite \
echo SUCCESS: cgroup set up for user nonroot at /sys/fs/cgroup/neon_testsuite \
" > /setup_neon_testsuite_cgroup.bash && chmod +x /setup_neon_testsuite_cgroup.bash
RUN echo "ALL ALL = (ALL) NOPASSWD: /setup_neon_testsuite_cgroup.bash" >> /etc/sudoers
# protobuf-compiler (protoc)
ENV PROTOC_VERSION 25.1
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \

View File

@@ -438,7 +438,7 @@ impl Endpoint {
}
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
// TODO use background_process::stop_process instead
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
let pid = nix::unistd::Pid::from_raw(pid as i32);
@@ -583,9 +583,21 @@ impl Endpoint {
}
let child = cmd.spawn()?;
// set up a scopeguard to kill & wait for the child in case we panic or bail below
let child = scopeguard::guard(child, |mut child| {
println!("SIGKILL & wait the started process");
(|| {
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
child.kill().context("SIGKILL child")?;
child.wait().context("wait() for child process")?;
anyhow::Ok(())
})()
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
.unwrap();
});
// Write down the pid so we can wait for it when we want to stop
// TODO use background_process::start_process instead
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
let pid = child.id();
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
std::fs::write(pidfile_path, pid.to_string())?;
@@ -634,6 +646,9 @@ impl Endpoint {
std::thread::sleep(ATTEMPT_INTERVAL);
}
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
drop(scopeguard::ScopeGuard::into_inner(child));
Ok(())
}

View File

@@ -2,11 +2,13 @@ from __future__ import annotations
import abc
import asyncio
import errno
import filecmp
import json
import os
import re
import shutil
import signal
import subprocess
import tempfile
import textwrap
@@ -173,6 +175,35 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite
yield versioned_dir
@pytest.fixture(scope="function")
def test_cgroup_dir(request: FixtureRequest) -> Optional[Path]:
## Use like so:
# systemd-run --user --shell
# cat /proc/self/cgroup
# # will look like so: 0::/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service
# env \
# NEON_TEST_SUITE_USE_CGROUPS=/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service \
# BUILD_TYPE=debug \
# DEFAULT_PG_VERSION=15 \
# ./scripts/pytest
var = os.getenv("NEON_TEST_SUITE_USE_CGROUPS")
if var is None:
return None
root = Path(var)
assert root.is_dir()
assert (
"cgroup2fs"
== subprocess.check_output(
["stat", "--file-system", "--format=%T", root],
text=True,
).strip()
)
test_cgroup_dir: Path = get_test_cgroup_dir(request, root)
test_cgroup_dir.mkdir(exist_ok=False, parents=False)
return test_cgroup_dir
def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
"""Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.
@@ -447,6 +478,7 @@ class NeonEnvBuilder:
initial_tenant: Optional[TenantId] = None,
initial_timeline: Optional[TimelineId] = None,
pageserver_virtual_file_io_engine: Optional[str] = None,
test_cgroup_dir: Optional[Path] = None,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -483,6 +515,7 @@ class NeonEnvBuilder:
self.top_output_dir = top_output_dir
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
self.test_cgroup_dir = test_cgroup_dir
assert test_name.startswith(
"test_"
@@ -876,6 +909,15 @@ class NeonEnvBuilder:
x.do_cleanup()
def __enter__(self) -> "NeonEnvBuilder":
if self.test_cgroup_dir is not None:
log.info(f"putting test runner into cgroup {self.test_cgroup_dir}")
procs_file = self.test_cgroup_dir / "cgroup.procs"
# TODO: auto-cleanup the cgroup, share code with __exit__
pids = procs_file.read_text().split()
assert pids == []
mypid = os.getpid()
with open(procs_file, "a") as f:
f.write(f" {mypid}\n")
return self
def __exit__(
@@ -929,6 +971,57 @@ class NeonEnvBuilder:
if cleanup_error is not None:
cleanup_error = e
if self.test_cgroup_dir is not None:
# TODO: ensure that we're the only python thread running;
# otherwise, if a test leaks a thread,
# our checking here would race with that other thread.
# => https://github.com/neondatabase/neon/issues/6486
log.info(f"check test runner cgroup for leaked processes: {self.test_cgroup_dir}")
procs_file = self.test_cgroup_dir / "cgroup.procs"
mypid = os.getpid()
# move ourselves out of cgroup
with open(self.test_cgroup_dir.parent / "cgroup.procs", "a") as f:
f.write(f"{mypid}")
# now freeze the test cgroup, so we can race-free list & SIGKILL the processes
freeze_file = self.test_cgroup_dir / "cgroup.freeze"
freeze_file.write_text("1")
# inspect remaining pids
pids = [int(pid) for pid in procs_file.read_text().split()]
had_leaked_pids = len(pids) > 0
for pid in pids:
# dump info
try:
args_file = Path(f"/proc/{pid}/cmdline").read_bytes()
args_parsed = [
bytes.decode("utf-8") for bytes in args_file.split(b"\x00")
] # NULL-byte separated
args = f"{args_parsed}"
except Exception as e:
args = f"cmdline unavailable, {type(e)}, {e}"
log.warning(f"SIGKILLing leaked process: {pid}: {args}")
# send SIGKILL to the frozen process
os.kill(pid, signal.SIGKILL)
# thaw the cgroup
freeze_file.write_text("0")
# wait for processes to exit
while True:
try:
self.test_cgroup_dir.rmdir()
break
except OSError as e:
if e.errno != errno.EBUSY:
raise
pids = [int(pid) for pid in procs_file.read_text().split()]
if len(pids) == 0:
break
log.warning(f"waiting for pids to exit: {pids}")
time.sleep(0.1)
log.info("all pids exited and test cgroup has been deleted")
cleanup_cmd = (
f"for pid in $(cat '{procs_file}'); do; kill -9 $pid; done; rmdir {procs_file}"
)
assert not had_leaked_pids, f"had pids in test cgroup after NeonEnvBuilder teardown, see logs for details & cleanup help\ncleanup using bash command: {cleanup_cmd}\n"
class NeonEnv:
"""
@@ -1200,6 +1293,7 @@ def _shared_simple_env(
pg_distrib_dir: Path,
pg_version: PgVersion,
pageserver_virtual_file_io_engine: str,
test_cgroup_dir: Optional[Path],
) -> Iterator[NeonEnv]:
"""
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
@@ -1230,6 +1324,7 @@ def _shared_simple_env(
test_name=request.node.name,
test_output_dir=test_output_dir,
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
test_cgroup_dir=test_cgroup_dir,
) as builder:
env = builder.init_start()
@@ -1269,6 +1364,7 @@ def neon_env_builder(
test_overlay_dir: Path,
top_output_dir: Path,
pageserver_virtual_file_io_engine: str,
test_cgroup_dir: Optional[Path],
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1302,6 +1398,7 @@ def neon_env_builder(
test_name=request.node.name,
test_output_dir=test_output_dir,
test_overlay_dir=test_overlay_dir,
test_cgroup_dir=test_cgroup_dir,
) as builder:
yield builder
@@ -3636,6 +3733,10 @@ def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
return get_test_output_dir(request, top_output_dir) / "repo"
def get_test_cgroup_dir(request: FixtureRequest, top_cgroup_dir: Path) -> Path:
return _get_test_dir(request, top_cgroup_dir, "")
def pytest_addoption(parser: Parser):
parser.addoption(
"--preserve-database-files",

View File

@@ -24,7 +24,8 @@ from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import subprocess_capture
def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
# fixture order matters here: neon_env_builder leaked process assertions
def test_import_from_vanilla(test_output_dir, pg_bin, neon_env_builder, vanilla_pg):
# Put data in vanilla pg
vanilla_pg.start()
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")

View File

@@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
env.neon_cli.endpoint_stop("ep1")
# ep1 is stopped so create ep2 will succeed
env.neon_cli.endpoint_start("ep2")
# cleanup
env.neon_cli.endpoint_stop("ep2")