Add close_fds for initdb command and add close fd test (#2060)

This PR adds a test for https://github.com/neondatabase/neon/pull/1834 and fixes the error in https://app.circleci.com/pipelines/github/neondatabase/neon/7753/workflows/94d1b796-10a3-4989-b23c-4c1eb4a49cf5/jobs/79586, which happens because `pageserver.pid` is held by the `initdb` command on restart.

Because the test requires `lsof` to be installed in the docker image, this PR also updates the caches and docker image specified in CircleCI config file.
This commit is contained in:
Thang Pham
2022-07-12 15:04:40 -04:00
committed by GitHub
parent 5cf94a5848
commit 7f048abf3b
3 changed files with 60 additions and 13 deletions

View File

@@ -5,10 +5,10 @@ executors:
resource_class: xlarge
docker:
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
- image: zimg/rust:1.58
- image: neondatabase/rust:1.58
neon-executor:
docker:
- image: zimg/rust:1.58
- image: neondatabase/rust:1.58
jobs:
# A job to build postgres
@@ -37,7 +37,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since
@@ -54,7 +54,7 @@ jobs:
- save_cache:
name: Save postgres cache
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
@@ -85,7 +85,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
@@ -93,7 +93,7 @@ jobs:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
- v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
@@ -107,7 +107,7 @@ jobs:
export CARGO_INCREMENTAL=0
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=cachepot
export RUSTC_WRAPPER=""
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
@@ -115,7 +115,7 @@ jobs:
- save_cache:
name: Save rust cache
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
@@ -142,11 +142,6 @@ jobs:
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/zenith/bin
mkdir -p /tmp/zenith/test_bin
mkdir -p /tmp/zenith/etc

View File

@@ -623,6 +623,7 @@ impl PostgresRedoProcess {
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
.close_fds()
.output()
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;

View File

@@ -0,0 +1,51 @@
from contextlib import closing
import shutil
import time
import subprocess
import os.path
from cached_property import threading
from fixtures.neon_fixtures import NeonEnv
from fixtures.log_helper import log
def lsof_path() -> str:
    """Return the location of the `lsof` executable as resolved by `shutil.which`.

    Raises:
        RuntimeError: if `lsof` cannot be found in PATH.
    """
    located = shutil.which("lsof")
    # Guard clause: hand back the resolved path as soon as we have one.
    if located is not None:
        return located
    raise RuntimeError('lsof not found in PATH')
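# Makes sure that `pageserver.pid` is only held by the pageserver process (which `lsof` reports
# as `pageserve`, because it truncates command names to 9 characters by default), not by any
# other command such as `initdb`.
# This is to test the changes in https://github.com/neondatabase/neon/pull/1834.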
def test_lsof_pageserver_pid(neon_simple_env: NeonEnv):
    """Check that only the pageserver process keeps `pageserver.pid` open.

    A background thread runs a small workload (create a branch, start postgres,
    create and update a table). While it runs, the main thread polls `lsof` on
    the pid file once per second and asserts that the only command holding it
    is `pageserve`.

    Any exception raised by the workload is re-raised once the polling loop
    finishes. Otherwise a workload that fails immediately would end the loop
    and let the test pass without having checked anything.
    """
    env = neon_simple_env

    # Exceptions raised inside the workload thread are collected here so that
    # they can be surfaced on the main (test) thread.
    workload_errors = []

    def start_workload():
        try:
            env.neon_cli.create_branch("test_lsof_pageserver_pid")
            pg = env.postgres.create_start("test_lsof_pageserver_pid")
            with closing(pg.connect()) as conn:
                with conn.cursor() as cur:
                    cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x")
                    cur.execute("update foo set x=x+1")
        except Exception as exc:
            # Not swallowed: re-raised by the main thread after the loop.
            workload_errors.append(exc)

    workload_thread = threading.Thread(target=start_workload, args=(), daemon=True)
    workload_thread.start()

    path = os.path.join(env.repo_dir, "pageserver.pid")
    lsof = lsof_path()
    while workload_thread.is_alive():
        # check=False: a non-zero exit status from `lsof` must not fail the
        # test by itself; only the parsed list of commands is asserted on.
        res = subprocess.run([lsof, path],
                             check=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

        # parse the `lsof` command's output to get only the list of commands:
        # drop the header line, then keep the first column of each row
        commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]]
        if commands:
            log.info(f"lsof commands: {commands}")
            # `pageserve` is the pageserver's command name as printed by lsof.
            assert commands == ['pageserve']

        time.sleep(1.0)

    # The thread has finished (or is about to); make that explicit, then fail
    # the test if the workload itself failed.
    workload_thread.join()
    if workload_errors:
        raise workload_errors[0]