From 025a0d3cf57a3a36cbe5e3c0d8ebeb6f9c391445 Mon Sep 17 00:00:00 2001 From: David Adam Date: Fri, 10 Jul 2020 22:36:42 +0800 Subject: [PATCH 1/4] proc: add log message for reaped disowned IDs --- src/proc.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/proc.cpp b/src/proc.cpp index f65651c38..b4558eb55 100644 --- a/src/proc.cpp +++ b/src/proc.cpp @@ -341,7 +341,11 @@ static void reap_disowned_pids() { auto disowned_pids = s_disowned_pids.acquire(); auto try_reap1 = [](pid_t pid) { int status; - return waitpid(pid, &status, WNOHANG) > 0; + int ret = waitpid(pid, &status, WNOHANG) > 0; + if (ret) { + FLOGF(proc_reap_external, "Reaped disowned PID or PGID %d", pid); + } + return ret; }; disowned_pids->erase(std::remove_if(disowned_pids->begin(), disowned_pids->end(), try_reap1), disowned_pids->end()); From 2720f3d2ef57e9f10e9ed2eea753c2df6dd88d37 Mon Sep 17 00:00:00 2001 From: David Adam Date: Tue, 14 Jul 2020 22:20:46 +0800 Subject: [PATCH 2/4] proc: disown PIDs, not just PGIDs add_disowned_pgid skipped jobs that have a PGID equal to the running process. However, this includes processes started in config.fish or when job control is turned off, so they never get waited on. Instead, refactor this function to add_disowned_job, and add either the PGID or all the PIDs of the job to the list of disowned PIDs/PGIDs. Fixes #7183. --- src/builtin_disown.cpp | 2 +- src/proc.cpp | 22 +++++++++++++++++----- src/proc.h | 6 +++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/builtin_disown.cpp b/src/builtin_disown.cpp index 3c9822df7..510044803 100644 --- a/src/builtin_disown.cpp +++ b/src/builtin_disown.cpp @@ -36,7 +36,7 @@ static int disown_job(const wchar_t *cmd, parser_t &parser, io_streams_t &stream // within the context of a subjob which will cause the parent job to crash in exec_job(). // Instead, we set a flag and the parser removes the job from the jobs list later. j->mut_flags().disown_requested = true; - if (pgid) add_disowned_pgid(*pgid); + add_disowned_job(j); return STATUS_CMD_OK; } diff --git a/src/proc.cpp b/src/proc.cpp index b4558eb55..268ae2224 100644 --- a/src/proc.cpp +++ b/src/proc.cpp @@ -326,13 +326,23 @@ bool job_t::has_external_proc() const { /// we exit. Poll these from time-to-time to prevent zombie processes from happening (#5342). static owning_lock> s_disowned_pids; -void add_disowned_pgid(pid_t pgid) { +void add_disowned_job(job_t *j) { + if (j == nullptr) return; + // NEVER add our own (or an invalid) pgid as they are not unique to only // one job, and may result in a deadlock if we attempt the wait. - if (pgid != getpgrp() && pgid > 0) { + auto pgid = j->get_pgid(); + if (pgid && *pgid != getpgrp() && *pgid > 0) { // waitpid(2) is signalled to wait on a process group rather than a // process id by using the negative of its value. - s_disowned_pids.acquire()->push_back(pgid * -1); + s_disowned_pids.acquire()->push_back(*pgid * -1); + } else { + // Instead, add the PIDs of any external processes + for (auto &process : j->processes) { + if (process->pid) { + s_disowned_pids.acquire()->push_back(process->pid); + } + } } } @@ -341,12 +351,14 @@ static void reap_disowned_pids() { auto disowned_pids = s_disowned_pids.acquire(); auto try_reap1 = [](pid_t pid) { int status; - int ret = waitpid(pid, &status, WNOHANG) > 0; - if (ret) { + int ret = waitpid(pid, &status, WNOHANG); + if (ret > 0) { FLOGF(proc_reap_external, "Reaped disowned PID or PGID %d", pid); } return ret; }; + // waitpid returns 0 iff the PID/PGID in question has not changed state; remove the pid/pgid + // if it has changed or an error occurs (presumably ECHILD because the child does not exist) disowned_pids->erase(std::remove_if(disowned_pids->begin(), disowned_pids->end(), try_reap1), disowned_pids->end()); } diff --git a/src/proc.h b/src/proc.h index 934eab2b5..f9ce8d50b 100644 --- a/src/proc.h +++ b/src/proc.h @@ -551,9 +551,9 @@ void hup_jobs(const job_list_t &jobs); /// \return 1 if transferred, 0 if no transfer was necessary, -1 on error. int terminal_maybe_give_to_job_group(const job_group_t *jg, bool continuing_from_stopped); -/// Add a pid to the list of pids we wait on even though they are not associated with any jobs. -/// Used to avoid zombie processes after disown. -void add_disowned_pgid(pid_t pgid); +/// Add a job to the list of PIDs/PGIDs we wait on even though they are not associated with any +/// jobs. Used to avoid zombie processes after disown. +void add_disowned_job(job_t *j); bool have_proc_stat(); From 2c5d4937e34a308c29455e04e5fb8d89a4eb918b Mon Sep 17 00:00:00 2001 From: David Adam Date: Tue, 14 Jul 2020 20:34:47 +0800 Subject: [PATCH 3/4] disown: add tests for disowned jobs in scripts --- tests/checks/jobs.fish | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/checks/jobs.fish b/tests/checks/jobs.fish index ed72e827d..1ad32670d 100644 --- a/tests/checks/jobs.fish +++ b/tests/checks/jobs.fish @@ -1,4 +1,19 @@ #RUN: %fish %s + +# Verify zombies are not left by disown (#7183, #5342) +# Do this first to avoid colliding with the other disowned processes below, which may +# still be running at the end of the script +sleep 0.2 & +disown +sleep 0.2 +echo Trigger process reaping +#CHECK: Trigger process reaping +# The initial approach here was to kill the PID of the sleep process, which should +# be gone by the time we get here. Unfortunately, kill from procps on pre-2016 distributions +# does not print an error for non-existent PIDs, so instead look for zombies in this session +# (there should be none). +ps -o state | string match 'Z*' + jobs -q echo $status #CHECK: 1 From d46b9ff9be0f0316d53339f55773be22ace25273 Mon Sep 17 00:00:00 2001 From: Mahmoud Al-Qudsi Date: Sat, 25 Jul 2020 20:31:44 -0500 Subject: [PATCH 4/4] Remove repeated acquire of disowned pid lock in a loop --- src/proc.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/proc.cpp b/src/proc.cpp index 268ae2224..9ab2ceb43 100644 --- a/src/proc.cpp +++ b/src/proc.cpp @@ -329,18 +329,19 @@ static owning_lock> s_disowned_pids; void add_disowned_job(job_t *j) { if (j == nullptr) return; - // NEVER add our own (or an invalid) pgid as they are not unique to only + // Never add our own (or an invalid) pgid as it is not unique to only // one job, and may result in a deadlock if we attempt the wait. auto pgid = j->get_pgid(); + auto disowned_pids = s_disowned_pids.acquire(); if (pgid && *pgid != getpgrp() && *pgid > 0) { // waitpid(2) is signalled to wait on a process group rather than a // process id by using the negative of its value. - s_disowned_pids.acquire()->push_back(*pgid * -1); + disowned_pids->push_back(*pgid * -1); } else { // Instead, add the PIDs of any external processes for (auto &process : j->processes) { if (process->pid) { - s_disowned_pids.acquire()->push_back(process->pid); + disowned_pids->push_back(process->pid); } } }