walredo: fix zombie processes ([postgres] <defunct>)

This change wraps the std::process:Child that we spawn for WAL redo
into a type that ensures that we try to SIGKILL + waitpid() on it.

If there is no explicit call to kill_and_wait(), the Drop implementation
will spawns a task that does it in the BACKGROUND_RUNTIME.
That's an ugly hack but I think it's better than doing kill+wait
synchronously from Drop, since I think the general assumption in the
Rust ecosystem is that Drop doesn't block.
Especially since the drop sites can be _any_ place that drops the last
Arc<PostgresRedoManager>, e.g., compaction or GC.

The benefit of having the new type over just adding a Drop impl to
PostgresRedoProcess is that we can construct it earlier than the full
PostgresRedoProcess in PostgresRedoProcess::launch().
That allows us to correctly kill+wait the child if there is an error in
PostgresRedoProcess::launch() after spawning it.

I also took a stab at a regression test. I manually verified
that it fails before the fix to walredo.rs.

fixes https://github.com/neondatabase/neon/issues/2761
closes https://github.com/neondatabase/neon/pull/2776
This commit is contained in:
Christian Schwarz
2022-11-08 16:44:17 +01:00
committed by Christian Schwarz
parent f720dd735e
commit 8654e95fae
4 changed files with 261 additions and 59 deletions

View File

@@ -0,0 +1,104 @@
import time
import psutil
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException
from fixtures.types import TenantId
def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_present=False):
children = psutil.Process(pageserver_pid).children()
for child in children:
if not wal_redo_present:
assert "--wal-redo" not in child.cmdline()
if not defunct_present:
assert child.status() != psutil.STATUS_ZOMBIE
# Check that the pageserver doesn't leave behind WAL redo processes
# when a tenant is detached. We had an issue previously where we failed
# to wait and consume the exit code of the WAL redo process, leaving it behind
# as a zombie process.
def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False)
# first check for non existing tenant
tenant_id = TenantId.generate()
with pytest.raises(
expected_exception=PageserverApiException,
match=f"Tenant not found for id {tenant_id}",
):
pageserver_http.tenant_detach(tenant_id)
# create new nenant
tenant_id, _ = env.neon_cli.create_tenant()
# assert tenant exists on disk
assert (env.repo_dir / "tenants" / str(tenant_id)).exists()
pg = env.postgres.create_start("main", tenant_id=tenant_id)
pg_conn = pg.connect()
cur = pg_conn.cursor()
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
# from shared_buffers without hitting the page server, which defeats the point
# of this test.
cur.execute("CREATE TABLE foo (t text)")
cur.execute(
"""
INSERT INTO foo
SELECT 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Verify that the table is larger than shared_buffers
cur.execute(
"""
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_ize
from pg_settings where name = 'shared_buffers'
"""
)
row = cur.fetchone()
assert row is not None
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
assert int(row[0]) < int(row[1])
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)
# After filling the table and doing the SELECT, it is guaranteed that we did some WAL redo.
# So, assert that the WAL redo process is present.
# XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail
assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False)
last_error = None
for i in range(3):
try:
pageserver_http.tenant_detach(tenant_id)
except Exception as e:
last_error = e
log.error(f"try {i} error detaching tenant: {e}")
continue
else:
break
# else is called if the loop finished without reaching "break"
else:
pytest.fail(f"could not detach tenant: {last_error}")
# check that nothing is left on disk for deleted tenant
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()
# Pageserver schedules kill+wait of the WAL redo process to the background runtime,
# asynchronously to tenant detach. Cut it some slack to complete kill+wait before
# checking.
time.sleep(1.0)
assert_child_processes(pagserver_pid, wal_redo_present=False, defunct_present=False)