Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-06-26 23:21:21

0001 #include "FWCore/Utilities/interface/RootHandlers.h"
0002 
0003 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0004 #include "FWCore/ServiceRegistry/interface/SystemBounds.h"
0005 #include "DataFormats/Common/interface/RefCoreStreamer.h"
0006 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0007 #include "FWCore/MessageLogger/interface/ELseverityLevel.h"
0008 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0010 #include "FWCore/PluginManager/interface/PluginCapabilities.h"
0011 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0012 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0013 #include "FWCore/Utilities/interface/EDMException.h"
0014 #include "FWCore/Reflection/interface/TypeWithDict.h"
0015 #include "FWCore/Utilities/interface/UnixSignalHandlers.h"
0016 #include "FWCore/ServiceRegistry/interface/CurrentModuleOnThread.h"
0017 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0018 
0019 #include "oneapi/tbb/concurrent_unordered_set.h"
0020 #include "oneapi/tbb/task.h"
0021 #include "oneapi/tbb/task_scheduler_observer.h"
0022 #include "oneapi/tbb/global_control.h"
0023 #include <memory>
0024 
0025 #include <thread>
0026 #include <sys/wait.h>
0027 #include <sstream>
0028 #include <cstring>
0029 #include <poll.h>
0030 #include <atomic>
0031 #include <algorithm>
0032 #include <vector>
0033 #include <string>
0034 #include <array>
0035 
0036 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
0037 // version.  This can break our stack trace printer.  Avoid this by
0038 // invoking the syscall directly.
0039 #ifdef __linux__
0040 #include <syscall.h>
0041 #endif
0042 
0043 #include "TROOT.h"
0044 #include "TError.h"
0045 #include "TFile.h"
0046 #include "TInterpreter.h"
0047 #include "TH1.h"
0048 #include "TSystem.h"
0049 #include "TUnixSystem.h"
0050 #include "TTree.h"
0051 #include "TVirtualStreamerInfo.h"
0052 
0053 #include "TClassTable.h"
0054 
0055 #include <memory>
0056 
namespace {
  // Capacity (in chars) of each fixed-size buffer that holds one
  // "Module: ..." line written after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}  // namespace
0062 
0063 namespace edm {
0064   class ConfigurationDescriptions;
0065   class ParameterSet;
0066   class ActivityRegistry;
0067 
0068   namespace service {
0069     class InitRootHandlers : public RootHandlers {
0070       friend int cmssw_stacktrace(void*);
0071 
0072     public:
0073       class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
0074       public:
0075         typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
0076 
0077         ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
0078         ~ThreadTracker() override = default;
0079 
0080         void on_scheduler_entry(bool) override {
0081           // ensure thread local has been allocated; not necessary on Linux with
0082           // the current cmsRun linkage, but could be an issue if the platform
0083           // or linkage leads to "lazy" allocation of the thread local.  By
0084           // referencing it here we make sure it has been allocated and can be
0085           // accessed safely from our signal handler.
0086           edm::CurrentModuleOnThread::getCurrentModuleOnThread();
0087           threadIDs_.insert(pthread_self());
0088         }
0089         void on_scheduler_exit(bool) override {}
0090         const Container_type& IDs() { return threadIDs_; }
0091 
0092       private:
0093         Container_type threadIDs_;
0094       };
0095 
0096       explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
0097       ~InitRootHandlers() override;
0098 
0099       static void fillDescriptions(ConfigurationDescriptions& descriptions);
0100       static void stacktraceFromThread();
0101       static const ThreadTracker::Container_type& threadIDs() {
0102         static const ThreadTracker::Container_type empty;
0103         if (threadTracker_) {
0104           return threadTracker_->IDs();
0105         }
0106         return empty;
0107       }
0108       static int stackTracePause() { return stackTracePause_; }
0109 
0110       static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
0111       static std::atomic<std::size_t> nextModule_, doneModules_;
0112 
0113     private:
0114       static char const* const* getPstackArgv();
0115       void enableWarnings_() override;
0116       void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override;
0117       void willBeUsingThreads() override;
0118 
0119       void cachePidInfo();
0120       static void stacktraceHelperThread();
0121 
0122       static constexpr int pidStringLength_ = 200;
0123       static char pidString_[pidStringLength_];
0124       static char const* const pstackArgv_[];
0125       static int parentToChild_[2];
0126       static int childToParent_[2];
0127       static std::unique_ptr<std::thread> helperThread_;
0128       static std::unique_ptr<ThreadTracker> threadTracker_;
0129       static int stackTracePause_;
0130 
0131       bool unloadSigHandler_;
0132       bool resetErrHandler_;
0133       bool loadAllDictionaries_;
0134       bool autoLibraryLoader_;
0135       bool autoClassParser_;
0136       bool interactiveDebug_;
0137       std::shared_ptr<const void> sigBusHandler_;
0138       std::shared_ptr<const void> sigSegvHandler_;
0139       std::shared_ptr<const void> sigIllHandler_;
0140       std::shared_ptr<const void> sigTermHandler_;
0141       std::shared_ptr<const void> sigAbrtHandler_;
0142       std::shared_ptr<const void> sigFpeHandler_;
0143     };
0144 
0145     inline bool isProcessWideService(InitRootHandlers const*) { return true; }
0146 
0147   }  // end of namespace service
0148 }  // end of namespace edm
0149 
namespace edm::service {
  // Namespace-scope declaration of the entry point executed in the cloned /
  // forked child; the definition appears further down in this file.
  int cmssw_stacktrace(void*);
}  // namespace edm::service
0155 
0156 namespace {
0157   thread_local edm::RootHandlers::SeverityLevel s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo;
0158 
0159   constexpr bool s_ignoreEverything = false;
0160 
0161   template <std::size_t SIZE>
0162   bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
0163     return (std::find_if(substrs.begin(), substrs.end(), [&search](const char* const s) -> bool {
0164               return (search.find(s) != std::string::npos);
0165             }) != substrs.end());
0166   }
0167 
0168   //Contents of a message which should be reported as an INFO not a ERROR
0169   constexpr std::array<const char* const, 9> in_message{
0170       {"no dictionary for class",
0171        "already in TClassTable",
0172        "matrix not positive definite",
0173        "not a TStreamerInfo object",
0174        "Problems declaring payload",
0175        "Announced number of args different from the real number of argument passed",  // Always printed if gDebug>0 - regardless of whether warning message is real.
0176        "nbins is <=0 - set to nbins = 1",
0177        "nbinsy is <=0 - set to nbinsy = 1",
0178        "oneapi::tbb::global_control is limiting"}};
0179 
0180   //Location generating messages which should be reported as an INFO not a ERROR
0181   constexpr std::array<const char* const, 7> in_location{{"Fit",
0182                                                           "TDecompChol::Solve",
0183                                                           "THistPainter::PaintInit",
0184                                                           "TUnixSystem::SetDisplay",
0185                                                           "TGClient::GetFontByName",
0186                                                           "Inverter::Dinv",
0187                                                           "RTaskArenaWrapper"}};
0188 
0189   constexpr std::array<const char* const, 3> in_message_print_error{{"number of iterations was insufficient",
0190                                                                      "bad integrand behavior",
0191                                                                      "integral is divergent, or slowly convergent"}};
0192 
0193   void RootErrorHandlerImpl(int level, char const* location, char const* message) {
0194     bool die = false;
0195 
0196     // Translate ROOT severity level to MessageLogger severity level
0197 
0198     edm::RootHandlers::SeverityLevel el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0199 
0200     if (level >= kFatal) {
0201       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0202     } else if (level >= kSysError) {
0203       el_severity = edm::RootHandlers::SeverityLevel::kSysError;
0204     } else if (level >= kError) {
0205       el_severity = edm::RootHandlers::SeverityLevel::kError;
0206     } else if (level >= kWarning) {
0207       el_severity = edm::RootHandlers::SeverityLevel::kWarning;
0208     }
0209 
0210     if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
0211       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0212     }
0213 
0214     // Adapt C-strings to std::strings
0215     // Arrange to report the error location as furnished by Root
0216 
0217     std::string el_location = "@SUB=?";
0218     if (location != nullptr)
0219       el_location = std::string("@SUB=") + std::string(location);
0220 
0221     std::string el_message = "?";
0222     if (message != nullptr)
0223       el_message = message;
0224 
0225     // Try to create a meaningful id string using knowledge of ROOT error messages
0226     //
0227     // id ==     "ROOT-ClassName" where ClassName is the affected class
0228     //      else "ROOT/ClassName" where ClassName is the error-declaring class
0229     //      else "ROOT"
0230 
0231     std::string el_identifier = "ROOT";
0232 
0233     std::string precursor("class ");
0234     size_t index1 = el_message.find(precursor);
0235     if (index1 != std::string::npos) {
0236       size_t index2 = index1 + precursor.length();
0237       size_t index3 = el_message.find_first_of(" :", index2);
0238       if (index3 != std::string::npos) {
0239         size_t substrlen = index3 - index2;
0240         el_identifier += "-";
0241         el_identifier += el_message.substr(index2, substrlen);
0242       }
0243     } else {
0244       index1 = el_location.find("::");
0245       if (index1 != std::string::npos) {
0246         el_identifier += "/";
0247         el_identifier += el_location.substr(0, index1);
0248       }
0249     }
0250 
0251     // Intercept some messages and upgrade the severity
0252 
0253     if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
0254         (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
0255         (el_message.find("not set") != std::string::npos)) {
0256       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0257     }
0258 
0259     if ((el_message.find("Tree branches") != std::string::npos) &&
0260         (el_message.find("different numbers of entries") != std::string::npos)) {
0261       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0262     }
0263 
0264     // Intercept some messages and downgrade the severity
0265 
0266     if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
0267         (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
0268          (el_message.find("possible entries are in use!") != std::string::npos))) {
0269       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0270     }
0271 
0272     // These are a special case because we do not want them to
0273     // be fatal, but we do want an error to print.
0274     bool alreadyPrinted = false;
0275     if (find_if_string(el_message, in_message_print_error)) {
0276       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0277       edm::LogError("Root_Error") << el_location << el_message;
0278       alreadyPrinted = true;
0279     }
0280 
0281     if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0282       // Don't throw if the message is just informational.
0283       die = false;
0284     } else {
0285       die = true;
0286     }
0287 
0288     // Feed the message to the MessageLogger and let it choose to suppress or not.
0289 
0290     // Root has declared a fatal error.  Throw an EDMException unless the
0291     // message corresponds to a pending signal. In that case, do not throw
0292     // but let the OS deal with the signal in the usual way.
0293     if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
0294       std::ostringstream sstr;
0295       sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
0296       edm::Exception except(edm::errors::FatalRootError, sstr.str());
0297       except.addAdditionalInfo(except.message());
0298       except.clearMessage();
0299       throw except;
0300     }
0301 
0302     // Typically, we get here only for informational messages,
0303     // but we leave the other code in just in case we change
0304     // the criteria for throwing.
0305     if (!alreadyPrinted) {
0306       if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
0307         edm::LogError("Root_Fatal") << el_location << el_message;
0308       } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
0309         edm::LogError("Root_Severe") << el_location << el_message;
0310       } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
0311         edm::LogError("Root_Error") << el_location << el_message;
0312       } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
0313         edm::LogWarning("Root_Warning") << el_location << el_message;
0314       } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0315         edm::LogInfo("Root_Information") << el_location << el_message;
0316       }
0317     }
0318   }
0319 
0320   void RootErrorHandler(int level, bool, char const* location, char const* message) {
0321     RootErrorHandlerImpl(level, location, message);
0322   }
0323 
0324   extern "C" {
0325   void set_default_signals() {
0326     signal(SIGILL, SIG_DFL);
0327     signal(SIGSEGV, SIG_DFL);
0328     signal(SIGBUS, SIG_DFL);
0329     signal(SIGTERM, SIG_DFL);
0330     signal(SIGFPE, SIG_DFL);
0331     signal(SIGABRT, SIG_DFL);
0332   }
0333 
0334   static int full_write(int fd, const char* text) {
0335     const char* buffer = text;
0336     size_t count = strlen(text);
0337     ssize_t written = 0;
0338     while (count) {
0339       written = write(fd, buffer, count);
0340       if (written == -1) {
0341         if (errno == EINTR) {
0342           continue;
0343         } else {
0344           return -errno;
0345         }
0346       }
0347       count -= written;
0348       buffer += written;
0349     }
0350     return 0;
0351   }
0352 
0353   static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
0354     char* buf = inbuf;
0355     size_t count = len;
0356     ssize_t complete = 0;
0357     std::chrono::time_point<std::chrono::steady_clock> end_time =
0358         std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
0359     int flags;
0360     if (timeout_s < 0) {
0361       flags = O_NONBLOCK;  // Prevents us from trying to set / restore flags later.
0362     } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
0363       return -errno;
0364     }
0365     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0366       if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
0367         return -errno;
0368       }
0369     }
0370     while (count) {
0371       if (timeout_s >= 0) {
0372         struct pollfd poll_info {
0373           fd, POLLIN, 0
0374         };
0375         int ms_remaining =
0376             std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
0377         if (ms_remaining > 0) {
0378           int rc = poll(&poll_info, 1, ms_remaining);
0379           if (rc <= 0) {
0380             if (rc < 0) {
0381               if (errno == EINTR || errno == EAGAIN) {
0382                 continue;
0383               }
0384               rc = -errno;
0385             } else {
0386               rc = -ETIMEDOUT;
0387             }
0388             if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0389               fcntl(fd, F_SETFL, flags);
0390             }
0391             return rc;
0392           }
0393         } else if (ms_remaining < 0) {
0394           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0395             fcntl(fd, F_SETFL, flags);
0396           }
0397           return -ETIMEDOUT;
0398         }
0399       }
0400       complete = read(fd, buf, count);
0401       if (complete == -1) {
0402         if (errno == EINTR) {
0403           continue;
0404         } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
0405           continue;
0406         } else {
0407           int orig_errno = errno;
0408           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0409             fcntl(fd, F_SETFL, flags);
0410           }
0411           return -orig_errno;
0412         }
0413       }
0414       count -= complete;
0415       buf += complete;
0416     }
0417     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0418       fcntl(fd, F_SETFL, flags);
0419     }
0420     return 0;
0421   }
0422 
0423   static int full_cerr_write(const char* text) { return full_write(2, text); }
0424 
// These signals are only used inside the stacktrace signal handler, so
// common signals can be used.  They do have to be different, since we do
// not set SA_NODEFER, and RESUME must be a signal that will cause sleep()
// to return early.
// RESUME_SIGNAL is parenthesized so that it expands safely inside any
// surrounding expression.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
0436 
0437   // does nothing, here only to interrupt the sleep() in the pause handler
0438   void sig_resume_handler(int sig, siginfo_t*, void*) {}
0439 
0440   // pause a thread so that a (slow) stacktrace will capture the current state
0441   void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
0442     using namespace edm::service;
0443 
0444 #ifdef RESUME_SIGNAL
0445     sigset_t sigset;
0446     sigemptyset(&sigset);
0447     sigaddset(&sigset, RESUME_SIGNAL);
0448     pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
0449 #endif
0450     // sleep interrrupts on a handled delivery of the resume signal
0451     sleep(InitRootHandlers::stackTracePause());
0452 
0453     if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
0454       auto i = InitRootHandlers::nextModule_++;
0455       if (i < InitRootHandlers::moduleListBuffers_.size()) {
0456         char* buff = InitRootHandlers::moduleListBuffers_[i].data();
0457 
0458         strlcpy(buff, "\nModule: ", moduleBufferSize);
0459         if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0460           strlcat(buff,
0461                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0462                   moduleBufferSize);
0463           strlcat(buff, ":", moduleBufferSize);
0464           strlcat(buff,
0465                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0466                   moduleBufferSize);
0467         } else {
0468           strlcat(buff, "none", moduleBufferSize);
0469         }
0470         ++edm::service::InitRootHandlers::doneModules_;
0471       }
0472     }
0473   }
0474 
0475   void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
0476     using namespace edm::service;
0477 
0478     const auto& tids = InitRootHandlers::threadIDs();
0479 
0480     const auto self = pthread_self();
0481 #ifdef PAUSE_SIGNAL
0482     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0483       // install the "pause" handler
0484       struct sigaction act;
0485       act.sa_sigaction = sig_pause_for_stacktrace;
0486       act.sa_flags = 0;
0487       sigemptyset(&act.sa_mask);
0488       sigaction(PAUSE_SIGNAL, &act, nullptr);
0489 
0490       // unblock pause signal globally, resume is unblocked in the pause handler
0491       sigset_t pausesigset;
0492       sigemptyset(&pausesigset);
0493       sigaddset(&pausesigset, PAUSE_SIGNAL);
0494       sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
0495 
0496       // send a pause signal to all CMSSW/TBB threads other than self
0497       for (auto id : tids) {
0498         if (self != id) {
0499           pthread_kill(id, PAUSE_SIGNAL);
0500         }
0501       }
0502 
0503 #ifdef RESUME_SIGNAL
0504       // install the "resume" handler
0505       act.sa_sigaction = sig_resume_handler;
0506       sigaction(RESUME_SIGNAL, &act, nullptr);
0507 #endif
0508     }
0509 #endif
0510 
0511     const char* signalname = "unknown";
0512     switch (sig) {
0513       case SIGBUS: {
0514         signalname = "bus error";
0515         break;
0516       }
0517       case SIGSEGV: {
0518         signalname = "segmentation violation";
0519         break;
0520       }
0521       case SIGILL: {
0522         signalname = "illegal instruction";
0523         break;
0524       }
0525       case SIGFPE: {
0526         signalname = "floating point exception";
0527         break;
0528       }
0529       case SIGTERM: {
0530         signalname = "external termination request";
0531         break;
0532       }
0533       case SIGABRT: {
0534         signalname = "abort signal";
0535         break;
0536       }
0537       default:
0538         break;
0539     }
0540     full_cerr_write("\n\nA fatal system signal has occurred: ");
0541     full_cerr_write(signalname);
0542     full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
0543 
0544     edm::service::InitRootHandlers::stacktraceFromThread();
0545 
0546     // resume the signal handlers to store the current module; we are not guaranteed they
0547     // will have time to store their modules, so there is a race condition; this could be
0548     // avoided by storing the module information before sleeping, a change that may be
0549     // made when we're convinced accessing the thread-local current module is safe.
0550 #ifdef RESUME_SIGNAL
0551     std::size_t notified = 0;
0552     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0553       for (auto id : tids) {
0554         if (self != id) {
0555           if (pthread_kill(id, RESUME_SIGNAL) == 0)
0556             ++notified;
0557         }
0558       }
0559     }
0560 #endif
0561 
0562     full_cerr_write("\nCurrent Modules:\n");
0563 
0564     // Checking tids.count(self) ensures that we only try to access the current module in
0565     // CMSSW/TBB threads.  Those threads access the thread-local current module at the same
0566     // time the thread is registered, so any lazy allocation will have been done at that
0567     // point.  Not necessary on Linux with the current cmsRun linkage, as the thread-local
0568     // is allocated at exec time, not lazily.
0569     if (tids.count(self) > 0) {
0570       char buff[moduleBufferSize] = "\nModule: ";
0571       if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0572         strlcat(buff,
0573                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0574                 moduleBufferSize);
0575         strlcat(buff, ":", moduleBufferSize);
0576         strlcat(buff,
0577                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0578                 moduleBufferSize);
0579       } else {
0580         strlcat(buff, "none", moduleBufferSize);
0581       }
0582       strlcat(buff, " (crashed)", moduleBufferSize);
0583       full_cerr_write(buff);
0584     } else {
0585       full_cerr_write("\nModule: non-CMSSW (crashed)");
0586     }
0587 
0588 #ifdef PAUSE_SIGNAL
0589     // wait a short interval for the paused threads to resume and fill in their module
0590     // information, then print
0591     if (InitRootHandlers::doneModules_.is_lock_free()) {
0592       int spincount = 0;
0593       timespec t = {0, 1000};
0594       while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
0595         nanosleep(&t, nullptr);
0596       }
0597       for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
0598         full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
0599       }
0600     }
0601 #endif
0602 
0603     full_cerr_write("\n\nA fatal system signal has occurred: ");
0604     full_cerr_write(signalname);
0605     full_cerr_write("\n");
0606 
0607     // For these known cases, re-raise the signal to get the correct
0608     // exit code.
0609     if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
0610         (sig == SIGABRT)) {
0611       signal(sig, SIG_DFL);
0612       raise(sig);
0613     } else {
0614       set_default_signals();
0615       ::abort();
0616     }
0617   }
0618 
0619   void sig_abort(int sig, siginfo_t*, void*) {
0620     full_cerr_write("\n\nFatal system signal has occurred during exit\n");
0621 
0622     // re-raise the signal to get the correct exit code
0623     signal(sig, SIG_DFL);
0624     raise(sig);
0625 
0626     // shouldn't get here
0627     set_default_signals();
0628     ::sleep(10);
0629     ::abort();
0630   }
0631   }
0632 }  // end of unnamed namespace
0633 
0634 namespace edm {
0635   namespace service {
0636 
0637     /*
0638      * We've run into issues where GDB fails to print the thread which calls clone().
0639      * To avoid this problem, we have an alternate approach below where the signal handler
0640      * only reads/writes to a dedicated thread via pipes.  The helper thread does the clone()
0641      * invocation; we don't care if that thread is missing from the traceback in this case.
0642      */
0643     static void cmssw_stacktrace_fork();
0644 
0645     void InitRootHandlers::stacktraceHelperThread() {
0646       int toParent = childToParent_[1];
0647       int fromParent = parentToChild_[0];
0648       char buf[2];
0649       buf[1] = '\0';
0650 
0651       while (true) {
0652         int result = full_read(fromParent, buf, 1);
0653         if (result < 0) {
0654           // To avoid a deadlock (this function is NOT re-entrant), reset signals
0655           // We never set them back to the CMSSW handler because we assume the parent
0656           // thread will abort for us.
0657           set_default_signals();
0658           close(toParent);
0659           full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
0660           full_cerr_write(strerror(-result));
0661           full_cerr_write("\n");
0662           ::abort();
0663         }
0664         if (buf[0] == '1') {
0665           set_default_signals();
0666           cmssw_stacktrace_fork();
0667           full_write(toParent, buf);
0668         } else if (buf[0] == '2') {
0669           // We have just finished forking.  Reload the file descriptors for thread
0670           // communication.
0671           close(toParent);
0672           close(fromParent);
0673           toParent = childToParent_[1];
0674           fromParent = parentToChild_[0];
0675         } else if (buf[0] == '3') {
0676           break;
0677         } else {
0678           set_default_signals();
0679           close(toParent);
0680           full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
0681           full_cerr_write(buf);
0682           full_cerr_write("\n");
0683           ::abort();
0684         }
0685       }
0686     }
0687 
0688     void InitRootHandlers::stacktraceFromThread() {
0689       int result = full_write(parentToChild_[1], "1");
0690       if (result < 0) {
0691         full_cerr_write("\n\nAttempt to request stacktrace failed: ");
0692         full_cerr_write(strerror(-result));
0693         full_cerr_write("\n");
0694         return;
0695       }
0696       char buf[2];
0697       buf[1] = '\0';
0698       if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
0699         full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
0700         if (result == -ETIMEDOUT) {
0701           full_cerr_write("timed out waiting for GDB to complete.");
0702         } else {
0703           full_cerr_write(strerror(-result));
0704         }
0705         full_cerr_write("\n");
0706         return;
0707       }
0708     }
0709 
0710     void cmssw_stacktrace_fork() {
0711       char child_stack[4 * 1024];
0712       char* child_stack_ptr = child_stack + 4 * 1024;
0713       // On Linux, we currently use jemalloc.  This registers pthread_atfork handlers; these
0714       // handlers are *not* async-signal safe.  Hence, a deadlock is possible if we invoke
0715       // fork() from our signal handlers.  Accordingly, we use clone (not POSIX, but AS-safe)
0716       // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
0717       int pid =
0718 #ifdef __linux__
0719           clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
0720 #else
0721           fork();
0722       if (child_stack_ptr) {
0723       }  // Suppress 'unused variable' warning on non-Linux
0724       if (pid == 0) {
0725         edm::service::cmssw_stacktrace(nullptr);
0726       }
0727 #endif
0728       if (pid == -1) {
0729         full_cerr_write("(Attempt to perform stack dump failed.)\n");
0730       } else {
0731         int status;
0732         if (waitpid(pid, &status, 0) == -1) {
0733           full_cerr_write("(Failed to wait on stack dump output.)\n");
0734         }
0735         if (status) {
0736           full_cerr_write("(GDB stack trace failed unexpectedly)\n");
0737         }
0738       }
0739     }
0740 
0741     int cmssw_stacktrace(void* /*arg*/) {
0742       set_default_signals();
0743 
0744       char const* const* argv = edm::service::InitRootHandlers::getPstackArgv();
0745       // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
0746       // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
0747       // calls dlsym.
0748 #ifdef __linux__
0749       syscall(SYS_execve, "/bin/sh", argv, __environ);
0750 #else
0751       execv("/bin/sh", argv);
0752 #endif
0753       ::abort();
0754       return 1;
0755     }
0756 
0757     static constexpr char pstackName[] = "(CMSSW stack trace helper)";
0758     static constexpr char dashC[] = "-c";
0759     char InitRootHandlers::pidString_[InitRootHandlers::pidStringLength_] = {};
0760     char const* const InitRootHandlers::pstackArgv_[] = {pstackName, dashC, InitRootHandlers::pidString_, nullptr};
0761     int InitRootHandlers::parentToChild_[2] = {-1, -1};
0762     int InitRootHandlers::childToParent_[2] = {-1, -1};
0763     std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
0764     std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
0765     int InitRootHandlers::stackTracePause_ = 300;
0766     std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
0767     std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
0768 
0769     InitRootHandlers::InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg)
0770         : RootHandlers(),
0771           unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
0772           resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
0773           loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
0774           autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
0775           autoClassParser_(pset.getUntrackedParameter<bool>("AutoClassParser")),
0776           interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
0777       stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
0778 
0779       if (not threadTracker_) {
0780         threadTracker_ = std::make_unique<ThreadTracker>();
0781         iReg.watchPostEndJob([]() {
0782           if (threadTracker_) {
0783             threadTracker_->observe(false);
0784           }
0785         });
0786       }
0787 
0788       if (unloadSigHandler_) {
0789         // Deactivate all the Root signal handlers and restore the system defaults
0790         gSystem->ResetSignal(kSigChild);
0791         gSystem->ResetSignal(kSigBus);
0792         gSystem->ResetSignal(kSigSegmentationViolation);
0793         gSystem->ResetSignal(kSigIllegalInstruction);
0794         gSystem->ResetSignal(kSigSystem);
0795         gSystem->ResetSignal(kSigPipe);
0796         gSystem->ResetSignal(kSigAlarm);
0797         gSystem->ResetSignal(kSigUrgent);
0798         gSystem->ResetSignal(kSigFloatingException);
0799         gSystem->ResetSignal(kSigWindowChanged);
0800       } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
0801         cachePidInfo();
0802 
0803         //NOTE: ROOT can also be told to abort on these kinds of problems BUT
0804         // it requires an TApplication to be instantiated which causes problems
0805         gSystem->ResetSignal(kSigBus);
0806         gSystem->ResetSignal(kSigSegmentationViolation);
0807         gSystem->ResetSignal(kSigIllegalInstruction);
0808         gSystem->ResetSignal(kSigFloatingException);
0809         installCustomHandler(SIGBUS, sig_dostack_then_abort);
0810         sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
0811         installCustomHandler(SIGSEGV, sig_dostack_then_abort);
0812         sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
0813         installCustomHandler(SIGILL, sig_dostack_then_abort);
0814         sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
0815         installCustomHandler(SIGTERM, sig_dostack_then_abort);
0816         sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
0817         installCustomHandler(SIGFPE, sig_dostack_then_abort);
0818         sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
0819         installCustomHandler(SIGABRT, sig_dostack_then_abort);
0820         sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
0821           signal(SIGABRT, SIG_DFL);  // release SIGABRT to default
0822         });
0823       }
0824 
0825       iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
0826         if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
0827           moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
0828         }
0829       });
0830 
0831       if (resetErrHandler_) {
0832         // Replace the Root error handler with one that uses the MessageLogger
0833         SetErrorHandler(RootErrorHandler);
0834       }
0835 
0836       // Enable automatic Root library loading.
0837       if (autoLibraryLoader_) {
0838         gInterpreter->SetClassAutoloading(1);
0839       }
0840 
0841       // Enable/disable automatic parsing of headers
0842       if (not autoClassParser_) {
0843         // Disable automatic parsing of headers during module construction
0844         iReg.watchPreModuleConstruction(
0845             [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(false); });
0846         iReg.watchPostModuleConstruction(
0847             [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(true); });
0848       }
0849 
0850       // Set ROOT parameters.
0851       TTree::SetMaxTreeSize(kMaxLong64);
0852       TH1::AddDirectory(kFALSE);
0853       //G__SetCatchException(0);
0854 
0855       // Set custom streamers
0856       setRefCoreStreamerInTClass();
0857 
0858       // Load the library containing dictionaries for std:: classes, if not already loaded.
0859       if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
0860         TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
0861       }
0862 
0863       int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
0864       if (debugLevel > 0) {
0865         gDebug = debugLevel;
0866       }
0867 
0868       // Enable Root implicit multi-threading
0869       bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
0870       if (imt && not ROOT::IsImplicitMTEnabled()) {
0871         //cmsRun uses global_control to set the number of allowed threads to use
0872         // we need to tell ROOT the same value in order to avoid unnecessary warnings
0873         ROOT::EnableImplicitMT(
0874             oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
0875       }
0876     }
0877 
0878     InitRootHandlers::~InitRootHandlers() {
0879       // close all open ROOT files
0880       TIter iter(gROOT->GetListOfFiles());
0881       TObject* obj = nullptr;
0882       while (nullptr != (obj = iter.Next())) {
0883         TFile* f = dynamic_cast<TFile*>(obj);
0884         if (f) {
0885           // We get a new iterator each time,
0886           // because closing a file can invalidate the iterator
0887           f->Close();
0888           iter = TIter(gROOT->GetListOfFiles());
0889         }
0890       }
0891       //disengage from TBB to avoid possible at exit problems
0892       threadTracker_.reset();
0893     }
0894 
0895     void InitRootHandlers::willBeUsingThreads() {
0896       //Tell Root we want to be multi-threaded
0897       ROOT::EnableThreadSafety();
0898 
0899       //When threading, also have to keep ROOT from logging all TObjects into a list
0900       TObject::SetObjectStat(false);
0901 
0902       //Have to avoid having Streamers modify themselves after they have been used
0903       TVirtualStreamerInfo::Optimize(false);
0904     }
0905 
0906     void InitRootHandlers::fillDescriptions(ConfigurationDescriptions& descriptions) {
0907       ParameterSetDescription desc;
0908       desc.setComment("Centralized interface to ROOT.");
0909       desc.addUntracked<bool>("UnloadRootSigHandler", false)
0910           ->setComment("If True, signals are handled by this service, rather than by ROOT.");
0911       desc.addUntracked<bool>("ResetRootErrHandler", true)
0912           ->setComment(
0913               "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
0914       desc.addUntracked<bool>("AutoLibraryLoader", true)
0915           ->setComment("If True, enables automatic loading of data dictionaries.");
0916       desc.addUntracked<bool>("AutoClassParser", true)
0917           ->setComment(
0918               "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are "
0919               "missing is disable during module construction. The current implementation of disabling the parsing is "
0920               "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or "
0921               "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
0922       desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
0923       desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
0924       desc.addUntracked<bool>("AbortOnSignal", true)
0925           ->setComment(
0926               "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
0927               "attempts to do a clean shutdown.");
0928       desc.addUntracked<bool>("InteractiveDebug", false)
0929           ->setComment(
0930               "If True, leave gdb attached to cmsRun after a crash; "
0931               "if False, attach gdb, print a stack trace, and quit gdb");
0932       desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
0933       desc.addUntracked<int>("StackTracePauseTime", 300)
0934           ->setComment("Seconds to pause other threads during stack trace.");
0935       descriptions.add("InitRootHandlers", desc);
0936     }
0937 
0938     char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
0939 
0940     void InitRootHandlers::enableWarnings_() { s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo; }
0941 
0942     void InitRootHandlers::ignoreWarnings_(edm::RootHandlers::SeverityLevel level) { s_ignoreWarnings = level; }
0943 
0944     void InitRootHandlers::cachePidInfo() {
0945       if (helperThread_) {
0946         //Another InitRootHandlers was initialized in this job, possibly
0947         // because multiple EventProcessors are being used.
0948         //In that case, we are already all setup
0949         return;
0950       }
0951       std::string gdbcmd{"date; gdb -quiet -p %d"};
0952       if (!interactiveDebug_) {
0953         gdbcmd +=
0954             " 2>&1 <<EOF |\n"
0955             "set width 0\n"
0956             "set height 0\n"
0957             "set pagination no\n"
0958             "thread apply all bt\n"
0959             "EOF\n"
0960             "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
0961       }
0962       if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
0963         std::ostringstream sstr;
0964         sstr << "Unable to pre-allocate stacktrace handler information";
0965         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0966         throw except;
0967       }
0968 
0969       // These are initialized to -1; harmless to close an invalid FD.
0970       // If this is called post-fork, we don't want to be communicating on
0971       // these FDs as they are used internally by the parent.
0972       close(childToParent_[0]);
0973       close(childToParent_[1]);
0974       childToParent_[0] = -1;
0975       childToParent_[1] = -1;
0976       close(parentToChild_[0]);
0977       close(parentToChild_[1]);
0978       parentToChild_[0] = -1;
0979       parentToChild_[1] = -1;
0980 
0981       if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
0982         std::ostringstream sstr;
0983         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0984         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0985         throw except;
0986       }
0987 
0988       if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
0989         close(childToParent_[0]);
0990         close(childToParent_[1]);
0991         childToParent_[0] = -1;
0992         childToParent_[1] = -1;
0993         std::ostringstream sstr;
0994         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0995         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0996         throw except;
0997       }
0998 
0999       helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
1000       helperThread_->detach();
1001     }
1002 
1003   }  // end of namespace service
1004 }  // end of namespace edm
1005 
1006 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
1007 
1008 using edm::service::InitRootHandlers;
1009 typedef edm::serviceregistry::AllArgsMaker<edm::RootHandlers, InitRootHandlers> RootHandlersMaker;
1010 DEFINE_FWK_SERVICE_MAKER(InitRootHandlers, RootHandlersMaker);