mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-02 13:50:38 +00:00
Compare commits
18 Commits
v16-lr-wor
...
problame/n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2978c839c6 | ||
|
|
1fdde9e41e | ||
|
|
3c71003b7d | ||
|
|
a36be87680 | ||
|
|
39490ddd6c | ||
|
|
2b581eefa7 | ||
|
|
cedc0376ff | ||
|
|
b2ec54b8ac | ||
|
|
193ba2384a | ||
|
|
bcb8bed875 | ||
|
|
23e36ae6a3 | ||
|
|
194981c16b | ||
|
|
415b489f18 | ||
|
|
407c78cfaf | ||
|
|
800d3d1cee | ||
|
|
4204a7dd59 | ||
|
|
b602063f7f | ||
|
|
bcfd333c98 |
6
.github/workflows/build_and_test.yml
vendored
6
.github/workflows/build_and_test.yml
vendored
@@ -443,7 +443,7 @@ jobs:
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --cgroupns=private --security-opt umask=/sys/fs/cgroup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -456,6 +456,9 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Setup cgroup for use by test suite
|
||||
run: sudo bash -x /setup_neon_testsuite_cgroup.bash
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -472,6 +475,7 @@ jobs:
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
NEON_TEST_SUITE_USE_CGROUPS: /sys/fs/cgroup/neon_testsuite
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
RUN set -e \
|
||||
@@ -43,6 +40,7 @@ RUN set -e \
|
||||
openssh-client \
|
||||
parallel \
|
||||
pkg-config \
|
||||
sudo \
|
||||
unzip \
|
||||
wget \
|
||||
xz-utils \
|
||||
@@ -50,6 +48,17 @@ RUN set -e \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
RUN echo "#!/usr/bin/env bash \
|
||||
set -exuo pipefail \
|
||||
mkdir /sys/fs/cgroup/neon_testsuite \
|
||||
chown -R nonroot:nonroot /sys/fs/cgroup/neon_testsuite \
|
||||
echo SUCCESS: cgroup set up for user nonroot at /sys/fs/cgroup/neon_testsuite \
|
||||
" > /setup_neon_testsuite_cgroup.bash && chmod +x /setup_neon_testsuite_cgroup.bash
|
||||
RUN echo "ALL ALL = (ALL) NOPASSWD: /setup_neon_testsuite_cgroup.bash" >> /etc/sudoers
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
|
||||
@@ -438,7 +438,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
// TODO use background_process::stop_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
@@ -583,9 +583,21 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let child = cmd.spawn()?;
|
||||
// set up a scopeguard to kill & wait for the child in case we panic or bail below
|
||||
let child = scopeguard::guard(child, |mut child| {
|
||||
println!("SIGKILL & wait the started process");
|
||||
(|| {
|
||||
// TODO: use another signal that can be caught by the child so it can clean up any children it spawned
|
||||
child.kill().context("SIGKILL child")?;
|
||||
child.wait().context("wait() for child process")?;
|
||||
anyhow::Ok(())
|
||||
})()
|
||||
.with_context(|| format!("scopeguard kill&wait child {child:?}"))
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
// Write down the pid so we can wait for it when we want to stop
|
||||
// TODO use background_process::start_process instead
|
||||
// TODO use background_process::start_process instead: https://github.com/neondatabase/neon/pull/6482
|
||||
let pid = child.id();
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
std::fs::write(pidfile_path, pid.to_string())?;
|
||||
@@ -634,6 +646,9 @@ impl Endpoint {
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
|
||||
drop(scopeguard::ScopeGuard::into_inner(child));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -2,11 +2,13 @@ from __future__ import annotations
|
||||
|
||||
import abc
|
||||
import asyncio
|
||||
import errno
|
||||
import filecmp
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import textwrap
|
||||
@@ -173,6 +175,35 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite
|
||||
yield versioned_dir
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def test_cgroup_dir(request: FixtureRequest) -> Optional[Path]:
|
||||
## Use like so:
|
||||
# systemd-run --user --shell
|
||||
# cat /proc/self/cgroup
|
||||
# # will look like so: 0::/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service
|
||||
# env \
|
||||
# NEON_TEST_SUITE_USE_CGROUPS=/sys/fs/cgroup/user.slice/user-1000.slice/user@1000.service/app.slice/run-u5.service \
|
||||
# BUILD_TYPE=debug \
|
||||
# DEFAULT_PG_VERSION=15 \
|
||||
# ./scripts/pytest
|
||||
|
||||
var = os.getenv("NEON_TEST_SUITE_USE_CGROUPS")
|
||||
if var is None:
|
||||
return None
|
||||
root = Path(var)
|
||||
assert root.is_dir()
|
||||
assert (
|
||||
"cgroup2fs"
|
||||
== subprocess.check_output(
|
||||
["stat", "--file-system", "--format=%T", root],
|
||||
text=True,
|
||||
).strip()
|
||||
)
|
||||
test_cgroup_dir: Path = get_test_cgroup_dir(request, root)
|
||||
test_cgroup_dir.mkdir(exist_ok=False, parents=False)
|
||||
return test_cgroup_dir
|
||||
|
||||
|
||||
def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
|
||||
"""Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.
|
||||
|
||||
@@ -447,6 +478,7 @@ class NeonEnvBuilder:
|
||||
initial_tenant: Optional[TenantId] = None,
|
||||
initial_timeline: Optional[TimelineId] = None,
|
||||
pageserver_virtual_file_io_engine: Optional[str] = None,
|
||||
test_cgroup_dir: Optional[Path] = None,
|
||||
):
|
||||
self.repo_dir = repo_dir
|
||||
self.rust_log_override = rust_log_override
|
||||
@@ -483,6 +515,7 @@ class NeonEnvBuilder:
|
||||
self.top_output_dir = top_output_dir
|
||||
|
||||
self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine
|
||||
self.test_cgroup_dir = test_cgroup_dir
|
||||
|
||||
assert test_name.startswith(
|
||||
"test_"
|
||||
@@ -876,6 +909,15 @@ class NeonEnvBuilder:
|
||||
x.do_cleanup()
|
||||
|
||||
def __enter__(self) -> "NeonEnvBuilder":
|
||||
if self.test_cgroup_dir is not None:
|
||||
log.info(f"putting test runner into cgroup {self.test_cgroup_dir}")
|
||||
procs_file = self.test_cgroup_dir / "cgroup.procs"
|
||||
# TODO: auto-cleanup the cgroup, share code with __exit__
|
||||
pids = procs_file.read_text().split()
|
||||
assert pids == []
|
||||
mypid = os.getpid()
|
||||
with open(procs_file, "a") as f:
|
||||
f.write(f" {mypid}\n")
|
||||
return self
|
||||
|
||||
def __exit__(
|
||||
@@ -929,6 +971,57 @@ class NeonEnvBuilder:
|
||||
if cleanup_error is not None:
|
||||
cleanup_error = e
|
||||
|
||||
if self.test_cgroup_dir is not None:
|
||||
# TODO: ensure that we're the only python thread running;
|
||||
# otherwise, if a test leaks a thread,
|
||||
# our checking here would race with that other thread.
|
||||
# => https://github.com/neondatabase/neon/issues/6486
|
||||
log.info(f"check test runner cgroup for leaked processes: {self.test_cgroup_dir}")
|
||||
procs_file = self.test_cgroup_dir / "cgroup.procs"
|
||||
mypid = os.getpid()
|
||||
# move ourselves out of cgroup
|
||||
with open(self.test_cgroup_dir.parent / "cgroup.procs", "a") as f:
|
||||
f.write(f"{mypid}")
|
||||
# now freeze the test cgroup, so we can race-free list & SIGKILL the processes
|
||||
freeze_file = self.test_cgroup_dir / "cgroup.freeze"
|
||||
freeze_file.write_text("1")
|
||||
# inspect remaining pids
|
||||
pids = [int(pid) for pid in procs_file.read_text().split()]
|
||||
had_leaked_pids = len(pids) > 0
|
||||
for pid in pids:
|
||||
# dump info
|
||||
try:
|
||||
args_file = Path(f"/proc/{pid}/cmdline").read_bytes()
|
||||
args_parsed = [
|
||||
bytes.decode("utf-8") for bytes in args_file.split(b"\x00")
|
||||
] # NULL-byte separated
|
||||
args = f"{args_parsed}"
|
||||
except Exception as e:
|
||||
args = f"cmdline unavailable, {type(e)}, {e}"
|
||||
log.warning(f"SIGKILLing leaked process: {pid}: {args}")
|
||||
# send SIGKILL to the frozen process
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
# thaw the cgroup
|
||||
freeze_file.write_text("0")
|
||||
# wait for processes to exit
|
||||
while True:
|
||||
try:
|
||||
self.test_cgroup_dir.rmdir()
|
||||
break
|
||||
except OSError as e:
|
||||
if e.errno != errno.EBUSY:
|
||||
raise
|
||||
pids = [int(pid) for pid in procs_file.read_text().split()]
|
||||
if len(pids) == 0:
|
||||
break
|
||||
log.warning(f"waiting for pids to exit: {pids}")
|
||||
time.sleep(0.1)
|
||||
log.info("all pids exited and test cgroup has been deleted")
|
||||
cleanup_cmd = (
|
||||
f"for pid in $(cat '{procs_file}'); do; kill -9 $pid; done; rmdir {procs_file}"
|
||||
)
|
||||
assert not had_leaked_pids, f"had pids in test cgroup after NeonEnvBuilder teardown, see logs for details & cleanup help\ncleanup using bash command: {cleanup_cmd}\n"
|
||||
|
||||
|
||||
class NeonEnv:
|
||||
"""
|
||||
@@ -1200,6 +1293,7 @@ def _shared_simple_env(
|
||||
pg_distrib_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
test_cgroup_dir: Optional[Path],
|
||||
) -> Iterator[NeonEnv]:
|
||||
"""
|
||||
# Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES
|
||||
@@ -1230,6 +1324,7 @@ def _shared_simple_env(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine,
|
||||
test_cgroup_dir=test_cgroup_dir,
|
||||
) as builder:
|
||||
env = builder.init_start()
|
||||
|
||||
@@ -1269,6 +1364,7 @@ def neon_env_builder(
|
||||
test_overlay_dir: Path,
|
||||
top_output_dir: Path,
|
||||
pageserver_virtual_file_io_engine: str,
|
||||
test_cgroup_dir: Optional[Path],
|
||||
) -> Iterator[NeonEnvBuilder]:
|
||||
"""
|
||||
Fixture to create a Neon environment for test.
|
||||
@@ -1302,6 +1398,7 @@ def neon_env_builder(
|
||||
test_name=request.node.name,
|
||||
test_output_dir=test_output_dir,
|
||||
test_overlay_dir=test_overlay_dir,
|
||||
test_cgroup_dir=test_cgroup_dir,
|
||||
) as builder:
|
||||
yield builder
|
||||
|
||||
@@ -3636,6 +3733,10 @@ def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
return get_test_output_dir(request, top_output_dir) / "repo"
|
||||
|
||||
|
||||
def get_test_cgroup_dir(request: FixtureRequest, top_cgroup_dir: Path) -> Path:
|
||||
return _get_test_dir(request, top_cgroup_dir, "")
|
||||
|
||||
|
||||
def pytest_addoption(parser: Parser):
|
||||
parser.addoption(
|
||||
"--preserve-database-files",
|
||||
|
||||
@@ -24,7 +24,8 @@ from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import subprocess_capture
|
||||
|
||||
|
||||
def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
|
||||
# fixture order matters here: neon_env_builder leaked process assertions
|
||||
def test_import_from_vanilla(test_output_dir, pg_bin, neon_env_builder, vanilla_pg):
|
||||
# Put data in vanilla pg
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser")
|
||||
|
||||
@@ -59,3 +59,5 @@ def test_neon_two_primary_endpoints_fail(
|
||||
env.neon_cli.endpoint_stop("ep1")
|
||||
# ep1 is stopped so create ep2 will succeed
|
||||
env.neon_cli.endpoint_start("ep2")
|
||||
# cleanup
|
||||
env.neon_cli.endpoint_stop("ep2")
|
||||
|
||||
Reference in New Issue
Block a user