Skip to content

Commit

Permalink
arbiter: Use waitpid() facilities to handle worker exit status
Browse files Browse the repository at this point in the history
This change is meant to handle the return value of waitpid() in a way
that is more in line with the man page of said syscall. The changes
can be summarized as follows:

* Use os.WIFEXITED and os.WIFSIGNALED to determine what caused
  waitpid() to return, and exactly how a worker may have exited.

* In case of normal termination, use os.WEXITSTATUS() to read the exit
  status (instead of using a hand-rolled bit shift). A redundant log
  message was removed in this code path.

* In case of termination by a signal, use os.WTERMSIG() to determine
  the signal which caused the worker to terminate. This was buggy
  before: the raw wait status was used as the signal number, so the
  core dump flag WCOREFLAG (0x80) could cause e.g. a SIGSEGV (signal
  11) to be reported as "code 139", i.e. "code (0x80 | 11)".

* Since waitpid() is called with neither WUNTRACED/WSTOPPED nor
  WCONTINUED, there's no need to have any os.WIFSTOPPED or
  os.WIFCONTINUED handling.
  • Loading branch information
sylt committed Feb 2, 2024
1 parent 21b6271 commit 97c695b
Showing 1 changed file with 29 additions and 33 deletions.
62 changes: 29 additions & 33 deletions gunicorn/arbiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,46 +521,42 @@ def reap_workers(self):
break
if self.reexec_pid == wpid:
self.reexec_pid = 0
else:
# A worker was terminated. If the termination reason was
# that it could not boot, we'll shut it down to avoid
# infinite start/stop cycles.
exitcode = status >> 8
continue

if os.WIFEXITED(status):
# A worker was normally terminated. If the termination
# reason was that it could not boot, we'll halt the server
# to avoid infinite start/stop cycles.
exitcode = os.WEXITSTATUS(status)
if exitcode != 0:
self.log.error('Worker (pid:%s) exited with code %s', wpid, exitcode)
self.log.error('Worker (pid:%s) exited with code %s',
wpid, exitcode)
if exitcode == self.WORKER_BOOT_ERROR:
reason = "Worker failed to boot."
raise HaltServer(reason, self.WORKER_BOOT_ERROR)
if exitcode == self.APP_LOAD_ERROR:
reason = "App failed to load."
raise HaltServer(reason, self.APP_LOAD_ERROR)

if exitcode > 0:
# If the exit code of the worker is greater than 0,
# let the user know.
self.log.error("Worker (pid:%s) exited with code %s.",
wpid, exitcode)
elif status > 0:
# If the exit code of the worker is 0 and the status
# is greater than 0, then it was most likely killed
# via a signal.
try:
sig_name = signal.Signals(status).name
except ValueError:
sig_name = "code {}".format(status)
msg = "Worker (pid:{}) was sent {}!".format(
wpid, sig_name)

# Additional hint for SIGKILL
if status == signal.SIGKILL:
msg += " Perhaps out of memory?"
self.log.error(msg)

worker = self.WORKERS.pop(wpid, None)
if not worker:
continue
worker.tmp.close()
self.cfg.child_exit(self, worker)
elif os.WIFSIGNALED(status):
# A worker was terminated by a signal.
sig = os.WTERMSIG(status)
try:
sig_name = signal.Signals(sig).name
except ValueError:
sig_name = "signal {}".format(sig)
msg = "Worker (pid:{}) was terminated by {}!".format(
wpid, sig_name)

# Additional hint for SIGKILL
if sig == signal.SIGKILL:
msg += " Perhaps out of memory?"
self.log.error(msg)

worker = self.WORKERS.pop(wpid, None)
if not worker:
continue
worker.tmp.close()
self.cfg.child_exit(self, worker)
except OSError as e:
if e.errno != errno.ECHILD:
raise
Expand Down

0 comments on commit 97c695b

Please sign in to comment.