Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-09-22 23:03:38

0001 #include "FWCore/Utilities/interface/RootHandlers.h"
0002 
0003 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0004 #include "FWCore/ServiceRegistry/interface/SystemBounds.h"
0005 #include "DataFormats/Common/interface/RefCoreStreamer.h"
0006 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0007 #include "FWCore/MessageLogger/interface/ELseverityLevel.h"
0008 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0010 #include "FWCore/PluginManager/interface/PluginCapabilities.h"
0011 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0012 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0013 #include "FWCore/Utilities/interface/EDMException.h"
0014 #include "FWCore/Reflection/interface/TypeWithDict.h"
0015 #include "FWCore/Utilities/interface/UnixSignalHandlers.h"
0016 #include "FWCore/ServiceRegistry/interface/CurrentModuleOnThread.h"
0017 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0018 
0019 #include "oneapi/tbb/concurrent_unordered_set.h"
0020 #include "oneapi/tbb/task.h"
0021 #include "oneapi/tbb/task_scheduler_observer.h"
0022 #include "oneapi/tbb/global_control.h"
0023 #include <memory>
0024 
0025 #include <thread>
0026 #include <sys/wait.h>
0027 #include <sstream>
0028 #include <cstring>
0029 #include <poll.h>
0030 #include <atomic>
0031 #include <algorithm>
0032 #include <vector>
0033 #include <string>
0034 #include <array>
0035 
0036 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
0037 // version.  This can break our stack trace printer.  Avoid this by
0038 // invoking the syscall directly.
0039 #ifdef __linux__
0040 #include <syscall.h>
0041 #endif
0042 
0043 #include "TROOT.h"
0044 #include "TError.h"
0045 #include "TFile.h"
0046 #include "TInterpreter.h"
0047 #include "TH1.h"
0048 #include "TSystem.h"
0049 #include "TUnixSystem.h"
0050 #include "TTree.h"
0051 #include "TVirtualStreamerInfo.h"
0052 
0053 #include "TClassTable.h"
0054 
0055 #include <memory>
0056 
namespace {
  // Capacity, in bytes, of each fixed-size buffer that holds one thread's
  // module-name line printed after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}  // namespace
0062 
0063 namespace edm {
0064   class ConfigurationDescriptions;
0065   class ParameterSet;
0066   class ActivityRegistry;
0067 
0068   namespace service {
// RootHandlers implementation that also owns the static state used by the
// fatal-signal stack-trace machinery in this file (helper thread, pipes,
// tracked thread IDs and per-thread module-name buffers).
class InitRootHandlers : public RootHandlers {
  // cmssw_stacktrace() needs access to the private getPstackArgv().
  friend int cmssw_stacktrace(void*);

public:
  // TBB scheduler observer that records the pthread ID of every thread that
  // enters the scheduler, so the signal handler knows which threads to signal.
  class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
  public:
    typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;

    // Starts observing as soon as the tracker is constructed.
    ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
    ~ThreadTracker() override = default;

    // Called when a thread enters the scheduler: registers the thread's ID.
    void on_scheduler_entry(bool) override {
      // ensure thread local has been allocated; not necessary on Linux with
      // the current cmsRun linkage, but could be an issue if the platform
      // or linkage leads to "lazy" allocation of the thread local.  By
      // referencing it here we make sure it has been allocated and can be
      // accessed safely from our signal handler.
      edm::CurrentModuleOnThread::getCurrentModuleOnThread();
      threadIDs_.insert(pthread_self());
    }
    // IDs are never removed; leaving the scheduler is deliberately a no-op.
    void on_scheduler_exit(bool) override {}
    // All thread IDs recorded so far.
    const Container_type& IDs() { return threadIDs_; }

  private:
    Container_type threadIDs_;
  };

  explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
  ~InitRootHandlers() override;

  static void fillDescriptions(ConfigurationDescriptions& descriptions);
  // Asks the helper thread (through the pipes below) to produce a stack trace
  // and waits for its reply.
  static void stacktraceFromThread();
  // Thread IDs recorded by the tracker, or an empty set when no tracker exists.
  static const ThreadTracker::Container_type& threadIDs() {
    static const ThreadTracker::Container_type empty;
    if (threadTracker_) {
      return threadTracker_->IDs();
    }
    return empty;
  }
  // Value passed to sleep() by the pause signal handler; a value <= 0 disables
  // pausing of the other threads in the fatal-signal handler.
  static int stackTracePause() { return stackTracePause_; }

  // One fixed-size text buffer per slot; the pause signal handler fills a slot
  // with the module its thread was running.
  static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
  // nextModule_: next slot to hand out; doneModules_: number of slots filled.
  static std::atomic<std::size_t> nextModule_, doneModules_;

private:
  static char const* const* getPstackArgv();
  void enableWarnings_() override;
  void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override;
  void willBeUsingThreads() override;

  void cachePidInfo();
  // Body of the helper thread: executes single-character commands read from
  // parentToChild_ and replies on childToParent_.
  static void stacktraceHelperThread();

  static constexpr int pidStringLength_ = 200;
  // Third element of pstackArgv_, i.e. the command string given to "/bin/sh -c".
  static char pidString_[pidStringLength_];
  static char const* const pstackArgv_[];
  // File-descriptor pairs ([0] is read from, [1] is written to) linking the
  // signal handler and the helper thread.
  static int parentToChild_[2];
  static int childToParent_[2];
  static std::unique_ptr<std::thread> helperThread_;
  static std::unique_ptr<ThreadTracker> threadTracker_;
  static int stackTracePause_;

  // Configuration flags read from the ParameterSet in the constructor.
  bool unloadSigHandler_;
  bool resetErrHandler_;
  bool loadAllDictionaries_;
  bool autoLibraryLoader_;
  bool interactiveDebug_;
  // Guards holding no object, only a deleter that runs when the guard is
  // released; sigBusHandler_'s deleter installs sig_abort for SIGBUS.
  // NOTE(review): the other guards presumably do the same for their signal — confirm.
  std::shared_ptr<const void> sigBusHandler_;
  std::shared_ptr<const void> sigSegvHandler_;
  std::shared_ptr<const void> sigIllHandler_;
  std::shared_ptr<const void> sigTermHandler_;
  std::shared_ptr<const void> sigAbrtHandler_;
  std::shared_ptr<const void> sigFpeHandler_;
};
0143 
// Overload selected for InitRootHandlers; always answers true.
inline bool isProcessWideService(InitRootHandlers const*) { return true; }
0145 
0146   }  // end of namespace service
0147 }  // end of namespace edm
0148 
0149 namespace edm {
0150   namespace service {
0151     int cmssw_stacktrace(void*);
0152   }
0153 }  // namespace edm
0154 
0155 namespace {
// Per-thread threshold: a ROOT message whose translated severity is at or
// below this level is reported as kInfo by RootErrorHandlerImpl.
thread_local edm::RootHandlers::SeverityLevel s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo;

// Compile-time switch; when true every ROOT message is first demoted to kInfo.
constexpr bool s_ignoreEverything = false;
0159 
// Returns true when `search` contains at least one of the C strings listed in
// `substrs`, false otherwise.
template <std::size_t SIZE>
bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
  const auto occursInSearch = [&search](const char* const candidate) -> bool {
    return search.find(candidate) != std::string::npos;
  };
  return std::any_of(substrs.begin(), substrs.end(), occursInSearch);
}
0166 
// Substrings of a ROOT message that cause it to be reported as INFO rather
// than as an ERROR.
constexpr std::array<const char* const, 9> in_message{
    {"no dictionary for class",
     "already in TClassTable",
     "matrix not positive definite",
     "not a TStreamerInfo object",
     "Problems declaring payload",
     "Announced number of args different from the real number of argument passed",  // Always printed if gDebug>0 - regardless of whether warning message is real.
     "nbins is <=0 - set to nbins = 1",
     "nbinsy is <=0 - set to nbinsy = 1",
     "oneapi::tbb::global_control is limiting"}};

// Substrings of a ROOT location whose messages are reported as INFO rather
// than as an ERROR.
constexpr std::array<const char* const, 7> in_location{{"Fit",
                                                        "TDecompChol::Solve",
                                                        "THistPainter::PaintInit",
                                                        "TUnixSystem::SetDisplay",
                                                        "TGClient::GetFontByName",
                                                        "Inverter::Dinv",
                                                        "RTaskArenaWrapper"}};

// Substrings of a ROOT message that is printed through LogError but must not
// be fatal (it is demoted to kInfo after being printed).
constexpr std::array<const char* const, 3> in_message_print_error{{"number of iterations was insufficient",
                                                                   "bad integrand behavior",
                                                                   "integral is divergent, or slowly convergent"}};
0191 
0192   void RootErrorHandlerImpl(int level, char const* location, char const* message) {
0193     bool die = false;
0194 
0195     // Translate ROOT severity level to MessageLogger severity level
0196 
0197     edm::RootHandlers::SeverityLevel el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0198 
0199     if (level >= kFatal) {
0200       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0201     } else if (level >= kSysError) {
0202       el_severity = edm::RootHandlers::SeverityLevel::kSysError;
0203     } else if (level >= kError) {
0204       el_severity = edm::RootHandlers::SeverityLevel::kError;
0205     } else if (level >= kWarning) {
0206       el_severity = edm::RootHandlers::SeverityLevel::kWarning;
0207     }
0208 
0209     if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
0210       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0211     }
0212 
0213     // Adapt C-strings to std::strings
0214     // Arrange to report the error location as furnished by Root
0215 
0216     std::string el_location = "@SUB=?";
0217     if (location != nullptr)
0218       el_location = std::string("@SUB=") + std::string(location);
0219 
0220     std::string el_message = "?";
0221     if (message != nullptr)
0222       el_message = message;
0223 
0224     // Try to create a meaningful id string using knowledge of ROOT error messages
0225     //
0226     // id ==     "ROOT-ClassName" where ClassName is the affected class
0227     //      else "ROOT/ClassName" where ClassName is the error-declaring class
0228     //      else "ROOT"
0229 
0230     std::string el_identifier = "ROOT";
0231 
0232     std::string precursor("class ");
0233     size_t index1 = el_message.find(precursor);
0234     if (index1 != std::string::npos) {
0235       size_t index2 = index1 + precursor.length();
0236       size_t index3 = el_message.find_first_of(" :", index2);
0237       if (index3 != std::string::npos) {
0238         size_t substrlen = index3 - index2;
0239         el_identifier += "-";
0240         el_identifier += el_message.substr(index2, substrlen);
0241       }
0242     } else {
0243       index1 = el_location.find("::");
0244       if (index1 != std::string::npos) {
0245         el_identifier += "/";
0246         el_identifier += el_location.substr(0, index1);
0247       }
0248     }
0249 
0250     // Intercept some messages and upgrade the severity
0251 
0252     if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
0253         (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
0254         (el_message.find("not set") != std::string::npos)) {
0255       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0256     }
0257 
0258     if ((el_message.find("Tree branches") != std::string::npos) &&
0259         (el_message.find("different numbers of entries") != std::string::npos)) {
0260       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0261     }
0262 
0263     // Intercept some messages and downgrade the severity
0264 
0265     if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
0266         (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
0267          (el_message.find("possible entries are in use!") != std::string::npos))) {
0268       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0269     }
0270 
0271     // These are a special case because we do not want them to
0272     // be fatal, but we do want an error to print.
0273     bool alreadyPrinted = false;
0274     if (find_if_string(el_message, in_message_print_error)) {
0275       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0276       edm::LogError("Root_Error") << el_location << el_message;
0277       alreadyPrinted = true;
0278     }
0279 
0280     if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0281       // Don't throw if the message is just informational.
0282       die = false;
0283     } else {
0284       die = true;
0285     }
0286 
0287     // Feed the message to the MessageLogger and let it choose to suppress or not.
0288 
0289     // Root has declared a fatal error.  Throw an EDMException unless the
0290     // message corresponds to a pending signal. In that case, do not throw
0291     // but let the OS deal with the signal in the usual way.
0292     if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
0293       std::ostringstream sstr;
0294       sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
0295       edm::Exception except(edm::errors::FatalRootError, sstr.str());
0296       except.addAdditionalInfo(except.message());
0297       except.clearMessage();
0298       throw except;
0299     }
0300 
0301     // Typically, we get here only for informational messages,
0302     // but we leave the other code in just in case we change
0303     // the criteria for throwing.
0304     if (!alreadyPrinted) {
0305       if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
0306         edm::LogError("Root_Fatal") << el_location << el_message;
0307       } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
0308         edm::LogError("Root_Severe") << el_location << el_message;
0309       } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
0310         edm::LogError("Root_Error") << el_location << el_message;
0311       } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
0312         edm::LogWarning("Root_Warning") << el_location << el_message;
0313       } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0314         edm::LogInfo("Root_Information") << el_location << el_message;
0315       }
0316     }
0317   }
0318 
// Error-handler entry point: forwards level, location and message to
// RootErrorHandlerImpl and ignores the unnamed bool argument.
// NOTE(review): the bool is presumably ROOT's "abort" flag of its error-handler
// callback signature — confirm against TError.h.
void RootErrorHandler(int level, bool, char const* location, char const* message) {
  RootErrorHandlerImpl(level, location, message);
}
0322 
0323   extern "C" {
// Puts SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE and SIGABRT back to their
// default disposition (SIG_DFL), in that order.
void set_default_signals() {
  const int handledSignals[] = {SIGILL, SIGSEGV, SIGBUS, SIGTERM, SIGFPE, SIGABRT};
  for (const int signum : handledSignals) {
    signal(signum, SIG_DFL);
  }
}
0332 
// Writes the whole NUL-terminated string `text` to `fd`, continuing after
// short writes and retrying when write() is interrupted (EINTR).
// Returns 0 on success, or -errno of the first other write() failure.
static int full_write(int fd, const char* text) {
  const char* cursor = text;
  for (size_t remaining = strlen(text); remaining != 0;) {
    const ssize_t rc = write(fd, cursor, remaining);
    if (rc == -1) {
      if (errno != EINTR) {
        return -errno;
      }
      continue;  // interrupted before anything was written: try again
    }
    remaining -= rc;
    cursor += rc;
  }
  return 0;
}
0351 
// Reads exactly `len` bytes from `fd` into `inbuf`.
//
//   timeout_s < 0  : no deadline; the descriptor's flags are left untouched.
//   timeout_s >= 0 : overall deadline in seconds.  The descriptor is switched
//                    to non-blocking mode for the duration of the call (and
//                    restored before returning) and poll() is used to wait.
//
// Returns 0 once all bytes have been read, -ETIMEDOUT when the deadline
// expires, -EPIPE when end-of-file is reached before `len` bytes arrived (the
// other end was closed), or -errno for any other failure.  EINTR and EAGAIN
// are retried.
static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
  char* buf = inbuf;
  size_t count = len;
  const std::chrono::time_point<std::chrono::steady_clock> end_time =
      std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
  int flags;
  if (timeout_s < 0) {
    flags = O_NONBLOCK;  // Prevents us from trying to set / restore flags later.
  } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
    return -errno;
  }
  const bool mustRestoreFlags = (flags & O_NONBLOCK) != O_NONBLOCK;
  if (mustRestoreFlags) {
    if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
      return -errno;
    }
  }
  // Puts the descriptor back into its original mode and hands back `rc`.
  // `rc` is computed by the caller before this runs, so a failing fcntl()
  // cannot clobber the errno being reported.
  const auto finish = [fd, flags, mustRestoreFlags](int rc) {
    if (mustRestoreFlags) {
      fcntl(fd, F_SETFL, flags);
    }
    return rc;
  };

  while (count) {
    if (timeout_s >= 0) {
      struct pollfd poll_info {
        fd, POLLIN, 0
      };
      const int ms_remaining = static_cast<int>(
          std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count());
      if (ms_remaining < 0) {
        return finish(-ETIMEDOUT);
      }
      // ms_remaining == 0 still polls once (without blocking) so that data
      // which is already available is read instead of spinning on EAGAIN.
      const int rc = poll(&poll_info, 1, ms_remaining);
      if (rc < 0) {
        if (errno == EINTR || errno == EAGAIN) {
          continue;
        }
        return finish(-errno);
      }
      if (rc == 0) {
        return finish(-ETIMEDOUT);
      }
    }
    const ssize_t complete = read(fd, buf, count);
    if (complete == -1) {
      if ((errno == EINTR) || (errno == EAGAIN) || (errno == EWOULDBLOCK)) {
        continue;
      }
      return finish(-errno);
    }
    if (complete == 0) {
      // End-of-file: no more data will ever arrive, so retrying would only
      // spin.  Report it as a broken pipe.
      return finish(-EPIPE);
    }
    count -= complete;
    buf += complete;
  }
  return finish(0);
}
0421 
0422   static int full_cerr_write(const char* text) { return full_write(2, text); }
0423 
// these signals are only used inside the stacktrace signal handler,
// so common signals can be used.  They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// The replacement lists are parenthesised so that the macros expand to a
// single value wherever they are used inside a larger expression.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL (SIGRTMAX)
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL (SIGINFO)
#define RESUME_SIGNAL (SIGALRM)
#endif
0435 
// Handler for the resume signal.  It intentionally does nothing: it exists
// only so that delivery of the signal interrupts the sleep() in the pause
// handler (sig_pause_for_stacktrace).
void sig_resume_handler(int sig, siginfo_t*, void*) {}
0438 
0439   // pause a thread so that a (slow) stacktrace will capture the current state
0440   void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
0441     using namespace edm::service;
0442 
0443 #ifdef RESUME_SIGNAL
0444     sigset_t sigset;
0445     sigemptyset(&sigset);
0446     sigaddset(&sigset, RESUME_SIGNAL);
0447     pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
0448 #endif
0449     // sleep interrrupts on a handled delivery of the resume signal
0450     sleep(InitRootHandlers::stackTracePause());
0451 
0452     if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
0453       auto i = InitRootHandlers::nextModule_++;
0454       if (i < InitRootHandlers::moduleListBuffers_.size()) {
0455         char* buff = InitRootHandlers::moduleListBuffers_[i].data();
0456 
0457         strlcpy(buff, "\nModule: ", moduleBufferSize);
0458         if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0459           strlcat(buff,
0460                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0461                   moduleBufferSize);
0462           strlcat(buff, ":", moduleBufferSize);
0463           strlcat(buff,
0464                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0465                   moduleBufferSize);
0466         } else {
0467           strlcat(buff, "none", moduleBufferSize);
0468         }
0469         ++edm::service::InitRootHandlers::doneModules_;
0470       }
0471     }
0472   }
0473 
// Handler for fatal signals.  In order, it:
//   1. (when pausing is enabled and more than one thread is tracked) installs
//      the pause handler and sends the pause signal to every other tracked thread;
//   2. prints the name of the signal and requests a stack trace from the
//      helper thread;
//   3. sends the resume signal to the paused threads so they can record their modules;
//   4. prints the module of the crashing thread followed by the modules
//      recorded by the other threads;
//   5. re-raises the signal with the default disposition (or aborts for a
//      signal it does not know).
void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
  using namespace edm::service;

  const auto& tids = InitRootHandlers::threadIDs();

  const auto self = pthread_self();
#ifdef PAUSE_SIGNAL
  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
    // install the "pause" handler
    struct sigaction act;
    act.sa_sigaction = sig_pause_for_stacktrace;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);
    sigaction(PAUSE_SIGNAL, &act, nullptr);

    // unblock pause signal globally, resume is unblocked in the pause handler
    sigset_t pausesigset;
    sigemptyset(&pausesigset);
    sigaddset(&pausesigset, PAUSE_SIGNAL);
    sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);

    // send a pause signal to all CMSSW/TBB threads other than self
    for (auto id : tids) {
      if (self != id) {
        pthread_kill(id, PAUSE_SIGNAL);
      }
    }

#ifdef RESUME_SIGNAL
    // install the "resume" handler
    act.sa_sigaction = sig_resume_handler;
    sigaction(RESUME_SIGNAL, &act, nullptr);
#endif
  }
#endif

  // human-readable name of the signal, used in the messages below
  const char* signalname = "unknown";
  switch (sig) {
    case SIGBUS: {
      signalname = "bus error";
      break;
    }
    case SIGSEGV: {
      signalname = "segmentation violation";
      break;
    }
    case SIGILL: {
      signalname = "illegal instruction";
      break;
    }
    case SIGFPE: {
      signalname = "floating point exception";
      break;
    }
    case SIGTERM: {
      signalname = "external termination request";
      break;
    }
    case SIGABRT: {
      signalname = "abort signal";
      break;
    }
    default:
      break;
  }
  full_cerr_write("\n\nA fatal system signal has occurred: ");
  full_cerr_write(signalname);
  full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");

  // returns once the helper thread has replied, or after an error/timeout
  edm::service::InitRootHandlers::stacktraceFromThread();

  // resume the signal handlers to store the current module; we are not guaranteed they
  // will have time to store their modules, so there is a race condition; this could be
  // avoided by storing the module information before sleeping, a change that may be
  // made when we're convinced accessing the thread-local current module is safe.
#ifdef RESUME_SIGNAL
  // number of threads the resume signal was successfully sent to; it is read
  // in the PAUSE_SIGNAL section below (the two macros are defined together)
  std::size_t notified = 0;
  if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
    for (auto id : tids) {
      if (self != id) {
        if (pthread_kill(id, RESUME_SIGNAL) == 0)
          ++notified;
      }
    }
  }
#endif

  full_cerr_write("\nCurrent Modules:\n");

  // Checking tids.count(self) ensures that we only try to access the current module in
  // CMSSW/TBB threads.  Those threads access the thread-local current module at the same
  // time the thread is registered, so any lazy allocation will have been done at that
  // point.  Not necessary on Linux with the current cmsRun linkage, as the thread-local
  // is allocated at exec time, not lazily.
  if (tids.count(self) > 0) {
    char buff[moduleBufferSize] = "\nModule: ";
    if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
      strlcat(buff,
              edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
              moduleBufferSize);
      strlcat(buff, ":", moduleBufferSize);
      strlcat(buff,
              edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
              moduleBufferSize);
    } else {
      strlcat(buff, "none", moduleBufferSize);
    }
    strlcat(buff, " (crashed)", moduleBufferSize);
    full_cerr_write(buff);
  } else {
    full_cerr_write("\nModule: non-CMSSW (crashed)");
  }

#ifdef PAUSE_SIGNAL
  // wait a short interval for the paused threads to resume and fill in their module
  // information, then print
  if (InitRootHandlers::doneModules_.is_lock_free()) {
    // at most 999 naps of 1000 ns each
    int spincount = 0;
    timespec t = {0, 1000};
    while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
      nanosleep(&t, nullptr);
    }
    for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
      full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
    }
  }
#endif

  full_cerr_write("\n\nA fatal system signal has occurred: ");
  full_cerr_write(signalname);
  full_cerr_write("\n");

  // For these known cases, re-raise the signal to get the correct
  // exit code.
  if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
      (sig == SIGABRT)) {
    signal(sig, SIG_DFL);
    raise(sig);
  } else {
    set_default_signals();
    ::abort();
  }
}
0617 
// Minimal fatal-signal handler: writes a one-line notice to stderr and
// re-raises the signal with its default disposition.  The remaining
// statements are a fallback in case raise() returns.
void sig_abort(int sig, siginfo_t*, void*) {
  full_cerr_write("\n\nFatal system signal has occurred during exit\n");

  // re-raise the signal to get the correct exit code
  signal(sig, SIG_DFL);
  raise(sig);

  // shouldn't get here
  set_default_signals();
  ::sleep(10);
  ::abort();
}
0630   }
0631 }  // end of unnamed namespace
0632 
0633 namespace edm {
0634   namespace service {
0635 
0636     /*
0637      * We've run into issues where GDB fails to print the thread which calls clone().
0638      * To avoid this problem, we have an alternate approach below where the signal handler
0639      * only reads/writes to a dedicated thread via pipes.  The helper thread does the clone()
0640      * invocation; we don't care if that thread is missing from the traceback in this case.
0641      */
0642     static void cmssw_stacktrace_fork();
0643 
0644     void InitRootHandlers::stacktraceHelperThread() {
0645       int toParent = childToParent_[1];
0646       int fromParent = parentToChild_[0];
0647       char buf[2];
0648       buf[1] = '\0';
0649 
0650       while (true) {
0651         int result = full_read(fromParent, buf, 1);
0652         if (result < 0) {
0653           // To avoid a deadlock (this function is NOT re-entrant), reset signals
0654           // We never set them back to the CMSSW handler because we assume the parent
0655           // thread will abort for us.
0656           set_default_signals();
0657           close(toParent);
0658           full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
0659           full_cerr_write(strerror(-result));
0660           full_cerr_write("\n");
0661           ::abort();
0662         }
0663         if (buf[0] == '1') {
0664           set_default_signals();
0665           cmssw_stacktrace_fork();
0666           full_write(toParent, buf);
0667         } else if (buf[0] == '2') {
0668           // We have just finished forking.  Reload the file descriptors for thread
0669           // communication.
0670           close(toParent);
0671           close(fromParent);
0672           toParent = childToParent_[1];
0673           fromParent = parentToChild_[0];
0674         } else if (buf[0] == '3') {
0675           break;
0676         } else {
0677           set_default_signals();
0678           close(toParent);
0679           full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
0680           full_cerr_write(buf);
0681           full_cerr_write("\n");
0682           ::abort();
0683         }
0684       }
0685     }
0686 
0687     void InitRootHandlers::stacktraceFromThread() {
0688       int result = full_write(parentToChild_[1], "1");
0689       if (result < 0) {
0690         full_cerr_write("\n\nAttempt to request stacktrace failed: ");
0691         full_cerr_write(strerror(-result));
0692         full_cerr_write("\n");
0693         return;
0694       }
0695       char buf[2];
0696       buf[1] = '\0';
0697       if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
0698         full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
0699         if (result == -ETIMEDOUT) {
0700           full_cerr_write("timed out waiting for GDB to complete.");
0701         } else {
0702           full_cerr_write(strerror(-result));
0703         }
0704         full_cerr_write("\n");
0705         return;
0706       }
0707     }
0708 
0709     void cmssw_stacktrace_fork() {
0710       char child_stack[4 * 1024];
0711       char* child_stack_ptr = child_stack + 4 * 1024;
0712       // On Linux, we currently use jemalloc.  This registers pthread_atfork handlers; these
0713       // handlers are *not* async-signal safe.  Hence, a deadlock is possible if we invoke
0714       // fork() from our signal handlers.  Accordingly, we use clone (not POSIX, but AS-safe)
0715       // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
0716       int pid =
0717 #ifdef __linux__
0718           clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
0719 #else
0720           fork();
0721       if (child_stack_ptr) {
0722       }  // Suppress 'unused variable' warning on non-Linux
0723       if (pid == 0) {
0724         edm::service::cmssw_stacktrace(nullptr);
0725       }
0726 #endif
0727       if (pid == -1) {
0728         full_cerr_write("(Attempt to perform stack dump failed.)\n");
0729       } else {
0730         int status;
0731         if (waitpid(pid, &status, 0) == -1) {
0732           full_cerr_write("(Failed to wait on stack dump output.)\n");
0733         }
0734         if (status) {
0735           full_cerr_write("(GDB stack trace failed unexpectedly)\n");
0736         }
0737       }
0738     }
0739 
// Entry point of the child created by cmssw_stacktrace_fork().  Restores the
// default signal dispositions and replaces the process image with /bin/sh,
// passing the argument vector from getPstackArgv().  The statements after the
// exec call are reached only if the exec fails, in which case the child aborts.
int cmssw_stacktrace(void* /*arg*/) {
  set_default_signals();

  char const* const* argv = edm::service::InitRootHandlers::getPstackArgv();
  // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
  // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
  // calls dlsym.
#ifdef __linux__
  // invoke the system call directly instead of going through execv
  syscall(SYS_execve, "/bin/sh", argv, __environ);
#else
  execv("/bin/sh", argv);
#endif
  ::abort();
  return 1;
}
0755 
// Definitions of the static state shared by the signal handlers and the
// helper thread.

// argv[0] and argv[1] of the shell started by cmssw_stacktrace().
static constexpr char pstackName[] = "(CMSSW stack trace helper)";
static constexpr char dashC[] = "-c";
// Zero-initialised buffer used as the command string (argv[2]).
char InitRootHandlers::pidString_[InitRootHandlers::pidStringLength_] = {};
// Null-terminated argument vector: { name, "-c", command string }.
char const* const InitRootHandlers::pstackArgv_[] = {pstackName, dashC, InitRootHandlers::pidString_, nullptr};
// Descriptor pairs start out as -1, i.e. no descriptor assigned yet.
int InitRootHandlers::parentToChild_[2] = {-1, -1};
int InitRootHandlers::childToParent_[2] = {-1, -1};
std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
// Initial value; the constructor overwrites it with the "StackTracePauseTime" parameter.
int InitRootHandlers::stackTracePause_ = 300;
std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
0767 
0768     InitRootHandlers::InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg)
0769         : RootHandlers(),
0770           unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
0771           resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
0772           loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
0773           autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
0774           interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
           // Configures ROOT from the untracked parameters in `pset`: signal
           // handling, error handler, class autoloading, custom streamers, gDebug
           // and implicit multi-threading. `iReg` is used to register the
           // post-end-job and preallocate callbacks below.
           //
           // stackTracePause_ is a static member, so the value read here replaces
           // whatever an earlier instance stored.
0775       stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
0776
           // threadTracker_ is static: only the first instance creates it and
           // registers the end-of-job callback that calls observe(false) on it.
0777       if (not threadTracker_) {
0778         threadTracker_ = std::make_unique<ThreadTracker>();
0779         iReg.watchPostEndJob([]() {
0780           if (threadTracker_) {
0781             threadTracker_->observe(false);
0782           }
0783         });
0784       }
0785
0786       if (unloadSigHandler_) {
0787         // Deactivate all the Root signal handlers and restore the system defaults
0788         gSystem->ResetSignal(kSigChild);
0789         gSystem->ResetSignal(kSigBus);
0790         gSystem->ResetSignal(kSigSegmentationViolation);
0791         gSystem->ResetSignal(kSigIllegalInstruction);
0792         gSystem->ResetSignal(kSigSystem);
0793         gSystem->ResetSignal(kSigPipe);
0794         gSystem->ResetSignal(kSigAlarm);
0795         gSystem->ResetSignal(kSigUrgent);
0796         gSystem->ResetSignal(kSigFloatingException);
0797         gSystem->ResetSignal(kSigWindowChanged);
0798       } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
             // AbortOnSignal is only consulted when UnloadRootSigHandler is false.
             // Prepare the gdb command string, pipes and helper thread before the
             // stack-dumping handlers are installed.
0799         cachePidInfo();
0800
0801         //NOTE: ROOT can also be told to abort on these kinds of problems BUT
0802         // it requires an TApplication to be instantiated which causes problems
0803         gSystem->ResetSignal(kSigBus);
0804         gSystem->ResetSignal(kSigSegmentationViolation);
0805         gSystem->ResetSignal(kSigIllegalInstruction);
0806         gSystem->ResetSignal(kSigFloatingException);
             // For each signal: install the stack-dumping handler now, and store a
             // null shared_ptr whose custom deleter swaps in the plain sig_abort
             // handler when that holder is released.
0807         installCustomHandler(SIGBUS, sig_dostack_then_abort);
0808         sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
0809         installCustomHandler(SIGSEGV, sig_dostack_then_abort);
0810         sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
0811         installCustomHandler(SIGILL, sig_dostack_then_abort);
0812         sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
0813         installCustomHandler(SIGTERM, sig_dostack_then_abort);
0814         sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
0815         installCustomHandler(SIGFPE, sig_dostack_then_abort);
0816         sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
             // SIGABRT is the exception: its holder restores SIG_DFL rather than sig_abort.
0817         installCustomHandler(SIGABRT, sig_dostack_then_abort);
0818         sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
0819           signal(SIGABRT, SIG_DFL);  // release SIGABRT to default
0820         });
0821       }
0822
           // Once the framework announces its bounds, make sure there is one
           // module-list buffer per allowed thread (the vector is never shrunk).
0823       iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
0824         if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
0825           moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
0826         }
0827       });
0828
0829       if (resetErrHandler_) {
0830         // Replace the Root error handler with one that uses the MessageLogger
0831         SetErrorHandler(RootErrorHandler);
0832       }
0833
0834       // Enable automatic Root library loading.
           // (autoLibraryLoader_ is forced on when LoadAllDictionaries is set.)
0835       if (autoLibraryLoader_) {
0836         gInterpreter->SetClassAutoloading(1);
0837       }
0838
0839       // Set ROOT parameters.
0840       TTree::SetMaxTreeSize(kMaxLong64);
0841       TH1::AddDirectory(kFALSE);
0842       //G__SetCatchException(0);
0843
0844       // Set custom streamers
0845       setRefCoreStreamerInTClass();
0846
0847       // Load the library containing dictionaries for std:: classes, if not already loaded.
0848       if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
0849         TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
0850       }
0851
           // A DebugLevel of zero or below leaves gDebug untouched.
0852       int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
0853       if (debugLevel > 0) {
0854         gDebug = debugLevel;
0855       }
0856
0857       // Enable Root implicit multi-threading
           // (skipped if it is already enabled).
0858       bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
0859       if (imt && not ROOT::IsImplicitMTEnabled()) {
0860         //cmsRun uses global_control to set the number of allowed threads to use
0861         // we need to tell ROOT the same value in order to avoid unnecessary warnings
0862         ROOT::EnableImplicitMT(
0863             oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
0864       }
0865     }
0866 
0867     InitRootHandlers::~InitRootHandlers() {
0868       // Shut every TFile that ROOT still lists as open before the service goes away.
0869       TIter fileIter(gROOT->GetListOfFiles());
0870       for (TObject* entry = fileIter.Next(); entry != nullptr; entry = fileIter.Next()) {
0871         auto* openFile = dynamic_cast<TFile*>(entry);
0872         if (openFile == nullptr) {
0873           continue;  // not a TFile: leave it in the list and move on
0874         }
0875         // Closing a file can invalidate the iterator, so restart from a
0876         // fresh one over the list after every Close().
0877         openFile->Close();
0878         fileIter = TIter(gROOT->GetListOfFiles());
0879       }
0880       // Release the thread tracker now (disengage from TBB) to avoid possible at-exit problems.
0881       threadTracker_.reset();
0882     }
0883 
0884     void InitRootHandlers::willBeUsingThreads() {
           // Switches ROOT into the configuration needed for multi-threaded use.
           // All three calls are static/namespace-level ROOT functions, i.e. they
           // change settings shared by the whole process, not this instance.
0885       //Tell Root we want to be multi-threaded
0886       ROOT::EnableThreadSafety();
0887
0888       //When threading, also have to keep ROOT from logging all TObjects into a list
0889       TObject::SetObjectStat(false);
0890
0891       //Have to avoid having Streamers modify themselves after they have been used
0892       TVirtualStreamerInfo::Optimize(false);
0893     }
0894 
0895     void InitRootHandlers::fillDescriptions(ConfigurationDescriptions& descriptions) {
0896       ParameterSetDescription psetDesc;  // filled below, then registered as "InitRootHandlers"
0897       auto addFlag = [&psetDesc](char const* label, bool fallback, char const* help) {
0898         psetDesc.addUntracked<bool>(label, fallback)->setComment(help);
0899       };  // untracked bool parameter with its default and help text
0900       auto addInt = [&psetDesc](char const* label, int fallback, char const* help) {
0901         psetDesc.addUntracked<int>(label, fallback)->setComment(help);
0902       };  // untracked int parameter with its default and help text
0903       psetDesc.setComment("Centralized interface to ROOT.");
0904       addFlag("UnloadRootSigHandler", false, "If True, signals are handled by this service, rather than by ROOT.");
0905       addFlag("ResetRootErrHandler",
0906               true,
0907               "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
0908       addFlag("AutoLibraryLoader", true, "If True, enables automatic loading of data dictionaries.");
0909       addFlag("LoadAllDictionaries", false, "If True, loads all ROOT dictionaries.");
0910       addFlag("EnableIMT", true, "If True, calls ROOT::EnableImplicitMT().");
0911       addFlag("AbortOnSignal",
0912               true,
0913               "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
0914               "attempts to do a clean shutdown.");
0915       addFlag("InteractiveDebug",
0916               false,
0917               "If True, leave gdb attached to cmsRun after a crash; "
0918               "if False, attach gdb, print a stack trace, and quit gdb");
0919       addInt("DebugLevel", 0, "Sets ROOT's gDebug value.");
           addInt("StackTracePauseTime", 300, "Seconds to pause other threads during stack trace.");
           descriptions.add("InitRootHandlers", psetDesc);
         }
0920 
0921     char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }  // static argv used for the /bin/sh exec in cmssw_stacktrace()
0922 
0923     void InitRootHandlers::enableWarnings_() { s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo; }  // sets s_ignoreWarnings to kInfo; NOTE(review): its consumer is outside this section
0924 
0925     void InitRootHandlers::ignoreWarnings_(edm::RootHandlers::SeverityLevel level) { s_ignoreWarnings = level; }  // stores `level` in s_ignoreWarnings
0926 
0927     void InitRootHandlers::cachePidInfo() {
           // Prepares, ahead of any crash, what the stack-trace machinery needs:
           //  - formats the shell/gdb command (including this process's pid) into pidString_,
           //  - (re)creates the childToParent_ / parentToChild_ pipes with O_CLOEXEC,
           //  - starts a detached thread running stacktraceHelperThread.
           // Does nothing if helperThread_ is already set.
           // Throws edm::Exception (errors::OtherCMS) if the command does not fit
           // into pidString_ or if either pipe cannot be created.
0928       if (helperThread_) {
0929         //Another InitRootHandlers was initialized in this job, possibly
0930         // because multiple EventProcessors are being used.
0931         //In that case, we are already all setup
0932         return;
0933       }
0934       std::string gdbcmd{"date; gdb -quiet -p %d"};
0935       if (!interactiveDebug_) {
             // Non-interactive mode: feed gdb a short script through a here-document
             // and let sed keep only the frame ("#...") and "Thread" lines.
0936         gdbcmd +=
0937             " 2>&1 <<EOF |\n"
0938             "set width 0\n"
0939             "set height 0\n"
0940             "set pagination no\n"
0941             "thread apply all bt\n"
0942             "EOF\n"
0943             "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
0944       }
           // snprintf returns the length it needed (or a negative value on error);
           // a result >= the buffer size means the command was truncated, and a
           // truncated command must never be handed to the shell.
0945       int const written = snprintf(pidString_, pidStringLength_, gdbcmd.c_str(), getpid());
           if (written < 0 || written >= static_cast<int>(pidStringLength_)) {
0946         std::ostringstream sstr;
0947         sstr << "Unable to pre-allocate stacktrace handler information";
0948         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0949         throw except;
0950       }
0951
0952       // These are initialized to -1; harmless to close an invalid FD.
0953       // If this is called post-fork, we don't want to be communicating on
0954       // these FDs as they are used internally by the parent.
0955       close(childToParent_[0]);
0956       close(childToParent_[1]);
0957       childToParent_[0] = -1;
0958       childToParent_[1] = -1;
0959       close(parentToChild_[0]);
0960       close(parentToChild_[1]);
0961       parentToChild_[0] = -1;
0962       parentToChild_[1] = -1;
0963
0964       if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
             int const pipeErrno = errno;
0965         std::ostringstream sstr;
0966         sstr << "Failed to create child-to-parent pipes (errno=" << pipeErrno << "): " << strerror(pipeErrno);
0967         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0968         throw except;
0969       }
0970
0971       if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
             // Capture errno before close(), which may overwrite it.
             int const pipeErrno = errno;
0972         close(childToParent_[0]);
0973         close(childToParent_[1]);
0974         childToParent_[0] = -1;
0975         childToParent_[1] = -1;
0976         std::ostringstream sstr;
0977         sstr << "Failed to create parent-to-child pipes (errno=" << pipeErrno << "): " << strerror(pipeErrno);
0978         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0979         throw except;
0980       }
0981
           // The helper thread is detached; helperThread_ stays non-null so later
           // calls take the early return above.
0982       helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
0983       helperThread_->detach();
0984     }
0985 
0986   }  // end of namespace service
0987 }  // end of namespace edm
0988 
0989 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0990 
0991 using edm::service::InitRootHandlers;
     // Maker type handed to the service-registration macro below for InitRootHandlers.
0992 using RootHandlersMaker = edm::serviceregistry::AllArgsMaker<edm::RootHandlers, InitRootHandlers>;
0993 DEFINE_FWK_SERVICE_MAKER(InitRootHandlers, RootHandlersMaker);