Logo ROOT  
Reference Guide
util.cxx
Go to the documentation of this file.
1/*
2 * Project: RooFit
3 * Authors:
4 * PB, Patrick Bos, Netherlands eScience Center, p.bos@esciencecenter.nl
5 * IP, Inti Pelupessy, Netherlands eScience Center, i.pelupessy@esciencecenter.nl
6 *
7 * Copyright (c) 2021, CERN
8 *
9 * Redistribution and use in source and binary forms,
10 * with or without modification, are permitted according to the terms
11 * listed in LICENSE (http://roofit.sourceforge.net/license.txt)
12 */
13
16
17#include <csignal> // kill, SIGKILL
18#include <iostream> // cerr, and indirectly WNOHANG, EINTR, W* macros
19#include <stdexcept> // runtime_error
20#include <sys/wait.h> // waitpid
21#include <string>
22
23namespace RooFit {
24namespace MultiProcess {
25
/// Wait for a child process to change state, force-killing it if it takes too long.
///
/// Polls `waitpid(child_pid, &status, WNOHANG)` in a loop. The loop repeats while the child
/// has not changed state yet or while the call was interrupted (EINTR). The first
/// `retries_before_killing` polls are made without interference; every poll after that is
/// preceded by sending SIGKILL to the child.
///
/// After the loop, a human-readable summary of a non-zero status is printed to stdout.
///
/// \param child_pid PID handed to waitpid (and to kill once patience has run out).
/// \param may_throw If true, a final waitpid failure is reported by throwing.
/// \param retries_before_killing Number of polls allowed before SIGKILLs are sent.
/// \return The raw status word as filled in by waitpid (0 if waitpid never filled it in).
/// \throws std::runtime_error if waitpid failed with an error other than EINTR and
///         `may_throw` is true; the message contains the errno value of that failure.
int wait_for_child(pid_t child_pid, bool may_throw, int retries_before_killing)
{
   int status = 0;
   int patience = retries_before_killing;
   pid_t tmp;
   do {
      if (patience-- < 1) {
         ::kill(child_pid, SIGKILL);
      }
      tmp = waitpid(child_pid, &status, WNOHANG);
   } while (tmp == 0                         // child has not yet changed state, try again
            || (-1 == tmp && EINTR == errno) // retry on interrupted system call
   );

   // Remember why waitpid failed before any of the printing below gets a chance to
   // overwrite errno.
   const int waitpid_errno = errno;

   // patience is decremented once per poll and a SIGKILL is only sent on polls that start
   // with patience < 1, so the number of SIGKILLs actually sent is -patience (when negative).
   if (patience < 0) {
      std::cout << "Had to send PID " << child_pid << " " << -patience << " SIGKILLs" << '\n';
   }

   if (0 != status) {
      if (WIFEXITED(status)) {
         printf("exited, status=%d\n", WEXITSTATUS(status));
      } else if (WIFSIGNALED(status)) {
         // termination by SIGTERM is deliberately not reported
         if (WTERMSIG(status) != SIGTERM) {
            printf("killed by signal %d\n", WTERMSIG(status));
         }
      } else if (WIFSTOPPED(status)) {
         printf("stopped by signal %d\n", WSTOPSIG(status));
      } else if (WIFCONTINUED(status)) {
         printf("continued\n");
      }
   }

   if (-1 == tmp && may_throw) {
      throw std::runtime_error(std::string("waitpid, errno ") + std::to_string(waitpid_errno));
   }

   return status;
}
63
// Body of handle_zmq_ppoll_error: inspects the error number of a caught ppoll exception
// (e.num()) and decides how the caller should react.
//
// Branches, in order:
//   - EINTR while ProcessManager::sigterm_received() is true
//   - any other EINTR
//   - EAGAIN
//   - anything else: throws std::logic_error with a message containing e.num() and e.what()
//
// NOTE(review): the signature line and the statements of the first three branches are not
// visible in this listing. The index at the end of this file gives the signature as
// `zmq_ppoll_error_response handle_zmq_ppoll_error(ZMQ::ppoll_error_t &e)`; presumably each
// branch returns a zmq_ppoll_error_response value — confirm against the upstream file.
65{
66 if ((e.num() == EINTR) && (ProcessManager::sigterm_received())) {
67 // valid EINTR, because we want to exit and kill the processes on SIGTERM
69 } else if (e.num() == EINTR) {
70 // on other EINTRs, we retry (mostly this happens in debuggers)
72 } else if (e.num() == EAGAIN) {
73 // This can happen from recv if ppoll initially gets a read-ready signal for a socket,
74 // but the received data does not pass the checksum test, so the socket becomes unreadable
75 // again or from non-blocking send if the socket becomes unwritable either due to the HWM
76 // being reached or the socket not being connected (anymore). The latter case usually means
77 // the connection has been severed from the other side, meaning it has probably been killed
78 // and in that case the next ppoll call will probably also receive a SIGTERM, ending the
79 // loop. In case something else is wrong, this message will print multiple times, which
80 // should be taken as a cue for writing a bug report :)
82 } else {
// No recognised errno: format a diagnostic into a fixed-size buffer and throw it.
83 char buffer[512];
84 snprintf(buffer, 512,
85 "handle_zmq_ppoll_error is out of options to handle exception, caught ZMQ::ppoll_error_t had errno %d "
86 "and text: %s\n",
87 e.num(), e.what());
88 throw std::logic_error(buffer);
89 }
90}
91
92// returns a tuple containing first the poll result and second a boolean flag that tells the caller whether it should
93// abort the enclosing loop
94std::tuple<std::vector<std::pair<size_t, zmq::event_flags>>, bool>
95careful_ppoll(ZeroMQPoller &poller, const sigset_t &ppoll_sigmask, std::size_t max_tries)
96{
97 std::size_t tries = 0;
98 std::vector<std::pair<size_t, zmq::event_flags>> poll_result;
99 bool abort = true;
100 bool carry_on = true;
101 while (carry_on && (tries++ < max_tries)) {
102 if (tries > 1) {
103 printf("careful_ppoll try %zu\n", tries);
104 }
105 try { // watch for zmq_error from ppoll caused by SIGTERM from master
106 poll_result = poller.ppoll(-1, &ppoll_sigmask);
107 abort = false;
108 carry_on = false;
109 } catch (ZMQ::ppoll_error_t &e) {
110 auto response = handle_zmq_ppoll_error(e);
111 if (response == zmq_ppoll_error_response::abort) {
112 break;
113 } else if (response == zmq_ppoll_error_response::unknown_eintr) {
114 printf("EINTR in careful_ppoll but no SIGTERM received, try %zu\n", tries);
115 continue;
116 } else if (response == zmq_ppoll_error_response::retry) {
117 printf("EAGAIN in careful_ppoll (from either send or receive), try %zu\n", tries);
118 continue;
119 }
120 }
121 }
122
123 if (tries == max_tries) {
124 printf("careful_ppoll reached maximum number of tries, %zu, please report as a bug\n", tries);
125 }
126 return std::make_tuple(poll_result, abort);
127}
128
129} // namespace MultiProcess
130} // namespace RooFit
#define e(i)
Definition: RSha256.hxx:103
#define snprintf
Definition: civetweb.c:1540
Wrapper class for polling ZeroMQ sockets.
Definition: ZeroMQPoller.h:26
std::vector< std::pair< size_t, zmq::event_flags > > ppoll(int timeo, const sigset_t *sigmask)
Poll the sockets with ppoll.
std::tuple< std::vector< std::pair< size_t, zmq::event_flags > >, bool > careful_ppoll(ZeroMQPoller &poller, const sigset_t &ppoll_sigmask, std::size_t max_tries=2)
Definition: util.cxx:95
zmq_ppoll_error_response handle_zmq_ppoll_error(ZMQ::ppoll_error_t &e)
Definition: util.cxx:64
int wait_for_child(pid_t child_pid, bool may_throw, int retries_before_killing)
Definition: util.cxx:26
The namespace RooFit contains mostly switches that change the behaviour of functions of PDFs (or othe...
Definition: Common.h:18