Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-06-11 03:13:10

0001 #include "FWCore/Utilities/interface/RootHandlers.h"
0002 
0003 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0004 #include "FWCore/ServiceRegistry/interface/SystemBounds.h"
0005 #include "DataFormats/Common/interface/RefCoreStreamer.h"
0006 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0007 #include "FWCore/MessageLogger/interface/ELseverityLevel.h"
0008 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0010 #include "FWCore/PluginManager/interface/PluginCapabilities.h"
0011 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0012 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0013 #include "FWCore/Utilities/interface/EDMException.h"
0014 #include "FWCore/Reflection/interface/TypeWithDict.h"
0015 #include "FWCore/Utilities/interface/UnixSignalHandlers.h"
0016 #include "FWCore/ServiceRegistry/interface/CurrentModuleOnThread.h"
0017 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0018 
0019 #include "oneapi/tbb/concurrent_unordered_set.h"
0020 #include "oneapi/tbb/task.h"
0021 #include "oneapi/tbb/task_scheduler_observer.h"
0022 #include "oneapi/tbb/global_control.h"
0023 #include <memory>
0024 
0025 #include <thread>
0026 #include <sys/wait.h>
0027 #include <sstream>
0028 #include <cstring>
0029 #include <poll.h>
0030 #include <atomic>
0031 #include <algorithm>
0032 #include <vector>
0033 #include <string>
0034 #include <array>
0035 
0036 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
0037 // version.  This can break our stack trace printer.  Avoid this by
0038 // invoking the syscall directly.
0039 #ifdef __linux__
0040 #include <syscall.h>
0041 #endif
0042 
0043 #include "TROOT.h"
0044 #include "TError.h"
0045 #include "TFile.h"
0046 #include "TInterpreter.h"
0047 #include "TH1.h"
0048 #include "TSystem.h"
0049 #include "TUnixSystem.h"
0050 #include "TTree.h"
0051 #include "TVirtualStreamerInfo.h"
0052 
0053 #include "TClassTable.h"
0054 
0055 #include <memory>
0056 
namespace {
  // Capacity, in bytes, of each statically allocated buffer that holds one
  // "Module: ..." line when modules are listed after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}  // namespace
0062 
0063 namespace edm {
0064   class ConfigurationDescriptions;
0065   class ParameterSet;
0066   class ActivityRegistry;
0067 
0068   namespace service {
0069     class InitRootHandlers : public RootHandlers {
0070       friend int cmssw_stacktrace(void*);
0071 
0072     public:
0073       class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
0074       public:
0075         typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
0076 
0077         ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
0078         ~ThreadTracker() override = default;
0079 
0080         void on_scheduler_entry(bool) override {
0081           // ensure thread local has been allocated; not necessary on Linux with
0082           // the current cmsRun linkage, but could be an issue if the platform
0083           // or linkage leads to "lazy" allocation of the thread local.  By
0084           // referencing it here we make sure it has been allocated and can be
0085           // accessed safely from our signal handler.
0086           edm::CurrentModuleOnThread::getCurrentModuleOnThread();
0087           threadIDs_.insert(pthread_self());
0088         }
0089         void on_scheduler_exit(bool) override {}
0090         const Container_type& IDs() { return threadIDs_; }
0091 
0092       private:
0093         Container_type threadIDs_;
0094       };
0095 
0096       explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
0097       ~InitRootHandlers() override;
0098 
0099       static void fillDescriptions(ConfigurationDescriptions& descriptions);
0100       static void stacktraceFromThread();
0101       static const ThreadTracker::Container_type& threadIDs() {
0102         static const ThreadTracker::Container_type empty;
0103         if (threadTracker_) {
0104           return threadTracker_->IDs();
0105         }
0106         return empty;
0107       }
0108       static int stackTracePause() { return stackTracePause_; }
0109 
0110       static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
0111       static std::atomic<std::size_t> nextModule_, doneModules_;
0112 
0113     private:
0114       static char const* const* getPstackArgv();
0115       void enableWarnings_() override;
0116       void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override;
0117       void willBeUsingThreads() override;
0118 
0119       void cachePidInfo();
0120       static void stacktraceHelperThread();
0121 
0122       static constexpr int pidStringLength_ = 200;
0123       static char pidString_[pidStringLength_];
0124       static char const* const pstackArgv_[];
0125       static int parentToChild_[2];
0126       static int childToParent_[2];
0127       static std::unique_ptr<std::thread> helperThread_;
0128       static std::unique_ptr<ThreadTracker> threadTracker_;
0129       static int stackTracePause_;
0130 
0131       bool unloadSigHandler_;
0132       bool resetErrHandler_;
0133       bool loadAllDictionaries_;
0134       bool autoLibraryLoader_;
0135       bool interactiveDebug_;
0136       std::shared_ptr<const void> sigBusHandler_;
0137       std::shared_ptr<const void> sigSegvHandler_;
0138       std::shared_ptr<const void> sigIllHandler_;
0139       std::shared_ptr<const void> sigTermHandler_;
0140       std::shared_ptr<const void> sigAbrtHandler_;
0141     };
0142 
0143     inline bool isProcessWideService(InitRootHandlers const*) { return true; }
0144 
0145   }  // end of namespace service
0146 }  // end of namespace edm
0147 
namespace edm {
  namespace service {
    // Entry point of the cloned/forked child that runs the stack-trace command.
    int cmssw_stacktrace(void* arg);
  }  // namespace service
}  // namespace edm
0153 
0154 namespace {
0155   thread_local edm::RootHandlers::SeverityLevel s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo;
0156 
0157   constexpr bool s_ignoreEverything = false;
0158 
0159   template <std::size_t SIZE>
0160   bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
0161     return (std::find_if(substrs.begin(), substrs.end(), [&search](const char* const s) -> bool {
0162               return (search.find(s) != std::string::npos);
0163             }) != substrs.end());
0164   }
0165 
0166   //Contents of a message which should be reported as an INFO not a ERROR
0167   constexpr std::array<const char* const, 9> in_message{
0168       {"no dictionary for class",
0169        "already in TClassTable",
0170        "matrix not positive definite",
0171        "not a TStreamerInfo object",
0172        "Problems declaring payload",
0173        "Announced number of args different from the real number of argument passed",  // Always printed if gDebug>0 - regardless of whether warning message is real.
0174        "nbins is <=0 - set to nbins = 1",
0175        "nbinsy is <=0 - set to nbinsy = 1",
0176        "oneapi::tbb::global_control is limiting"}};
0177 
0178   //Location generating messages which should be reported as an INFO not a ERROR
0179   constexpr std::array<const char* const, 7> in_location{{"Fit",
0180                                                           "TDecompChol::Solve",
0181                                                           "THistPainter::PaintInit",
0182                                                           "TUnixSystem::SetDisplay",
0183                                                           "TGClient::GetFontByName",
0184                                                           "Inverter::Dinv",
0185                                                           "RTaskArenaWrapper"}};
0186 
0187   constexpr std::array<const char* const, 3> in_message_print_error{{"number of iterations was insufficient",
0188                                                                      "bad integrand behavior",
0189                                                                      "integral is divergent, or slowly convergent"}};
0190 
0191   void RootErrorHandlerImpl(int level, char const* location, char const* message) {
0192     bool die = false;
0193 
0194     // Translate ROOT severity level to MessageLogger severity level
0195 
0196     edm::RootHandlers::SeverityLevel el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0197 
0198     if (level >= kFatal) {
0199       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0200     } else if (level >= kSysError) {
0201       el_severity = edm::RootHandlers::SeverityLevel::kSysError;
0202     } else if (level >= kError) {
0203       el_severity = edm::RootHandlers::SeverityLevel::kError;
0204     } else if (level >= kWarning) {
0205       el_severity = edm::RootHandlers::SeverityLevel::kWarning;
0206     }
0207 
0208     if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
0209       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0210     }
0211 
0212     // Adapt C-strings to std::strings
0213     // Arrange to report the error location as furnished by Root
0214 
0215     std::string el_location = "@SUB=?";
0216     if (location != nullptr)
0217       el_location = std::string("@SUB=") + std::string(location);
0218 
0219     std::string el_message = "?";
0220     if (message != nullptr)
0221       el_message = message;
0222 
0223     // Try to create a meaningful id string using knowledge of ROOT error messages
0224     //
0225     // id ==     "ROOT-ClassName" where ClassName is the affected class
0226     //      else "ROOT/ClassName" where ClassName is the error-declaring class
0227     //      else "ROOT"
0228 
0229     std::string el_identifier = "ROOT";
0230 
0231     std::string precursor("class ");
0232     size_t index1 = el_message.find(precursor);
0233     if (index1 != std::string::npos) {
0234       size_t index2 = index1 + precursor.length();
0235       size_t index3 = el_message.find_first_of(" :", index2);
0236       if (index3 != std::string::npos) {
0237         size_t substrlen = index3 - index2;
0238         el_identifier += "-";
0239         el_identifier += el_message.substr(index2, substrlen);
0240       }
0241     } else {
0242       index1 = el_location.find("::");
0243       if (index1 != std::string::npos) {
0244         el_identifier += "/";
0245         el_identifier += el_location.substr(0, index1);
0246       }
0247     }
0248 
0249     // Intercept some messages and upgrade the severity
0250 
0251     if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
0252         (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
0253         (el_message.find("not set") != std::string::npos)) {
0254       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0255     }
0256 
0257     if ((el_message.find("Tree branches") != std::string::npos) &&
0258         (el_message.find("different numbers of entries") != std::string::npos)) {
0259       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0260     }
0261 
0262     // Intercept some messages and downgrade the severity
0263 
0264     if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
0265         (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
0266          (el_message.find("possible entries are in use!") != std::string::npos))) {
0267       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0268     }
0269 
0270     // These are a special case because we do not want them to
0271     // be fatal, but we do want an error to print.
0272     bool alreadyPrinted = false;
0273     if (find_if_string(el_message, in_message_print_error)) {
0274       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0275       edm::LogError("Root_Error") << el_location << el_message;
0276       alreadyPrinted = true;
0277     }
0278 
0279     if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0280       // Don't throw if the message is just informational.
0281       die = false;
0282     } else {
0283       die = true;
0284     }
0285 
0286     // Feed the message to the MessageLogger and let it choose to suppress or not.
0287 
0288     // Root has declared a fatal error.  Throw an EDMException unless the
0289     // message corresponds to a pending signal. In that case, do not throw
0290     // but let the OS deal with the signal in the usual way.
0291     if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
0292       std::ostringstream sstr;
0293       sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
0294       edm::Exception except(edm::errors::FatalRootError, sstr.str());
0295       except.addAdditionalInfo(except.message());
0296       except.clearMessage();
0297       throw except;
0298     }
0299 
0300     // Typically, we get here only for informational messages,
0301     // but we leave the other code in just in case we change
0302     // the criteria for throwing.
0303     if (!alreadyPrinted) {
0304       if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
0305         edm::LogError("Root_Fatal") << el_location << el_message;
0306       } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
0307         edm::LogError("Root_Severe") << el_location << el_message;
0308       } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
0309         edm::LogError("Root_Error") << el_location << el_message;
0310       } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
0311         edm::LogWarning("Root_Warning") << el_location << el_message;
0312       } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0313         edm::LogInfo("Root_Information") << el_location << el_message;
0314       }
0315     }
0316   }
0317 
0318   void RootErrorHandler(int level, bool, char const* location, char const* message) {
0319     RootErrorHandlerImpl(level, location, message);
0320   }
0321 
0322   extern "C" {
0323   void set_default_signals() {
0324     signal(SIGILL, SIG_DFL);
0325     signal(SIGSEGV, SIG_DFL);
0326     signal(SIGBUS, SIG_DFL);
0327     signal(SIGTERM, SIG_DFL);
0328     signal(SIGABRT, SIG_DFL);
0329   }
0330 
0331   static int full_write(int fd, const char* text) {
0332     const char* buffer = text;
0333     size_t count = strlen(text);
0334     ssize_t written = 0;
0335     while (count) {
0336       written = write(fd, buffer, count);
0337       if (written == -1) {
0338         if (errno == EINTR) {
0339           continue;
0340         } else {
0341           return -errno;
0342         }
0343       }
0344       count -= written;
0345       buffer += written;
0346     }
0347     return 0;
0348   }
0349 
0350   static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
0351     char* buf = inbuf;
0352     size_t count = len;
0353     ssize_t complete = 0;
0354     std::chrono::time_point<std::chrono::steady_clock> end_time =
0355         std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
0356     int flags;
0357     if (timeout_s < 0) {
0358       flags = O_NONBLOCK;  // Prevents us from trying to set / restore flags later.
0359     } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
0360       return -errno;
0361     }
0362     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0363       if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
0364         return -errno;
0365       }
0366     }
0367     while (count) {
0368       if (timeout_s >= 0) {
0369         struct pollfd poll_info {
0370           fd, POLLIN, 0
0371         };
0372         int ms_remaining =
0373             std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
0374         if (ms_remaining > 0) {
0375           int rc = poll(&poll_info, 1, ms_remaining);
0376           if (rc <= 0) {
0377             if (rc < 0) {
0378               if (errno == EINTR || errno == EAGAIN) {
0379                 continue;
0380               }
0381               rc = -errno;
0382             } else {
0383               rc = -ETIMEDOUT;
0384             }
0385             if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0386               fcntl(fd, F_SETFL, flags);
0387             }
0388             return rc;
0389           }
0390         } else if (ms_remaining < 0) {
0391           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0392             fcntl(fd, F_SETFL, flags);
0393           }
0394           return -ETIMEDOUT;
0395         }
0396       }
0397       complete = read(fd, buf, count);
0398       if (complete == -1) {
0399         if (errno == EINTR) {
0400           continue;
0401         } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
0402           continue;
0403         } else {
0404           int orig_errno = errno;
0405           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0406             fcntl(fd, F_SETFL, flags);
0407           }
0408           return -orig_errno;
0409         }
0410       }
0411       count -= complete;
0412       buf += complete;
0413     }
0414     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0415       fcntl(fd, F_SETFL, flags);
0416     }
0417     return 0;
0418   }
0419 
0420   static int full_cerr_write(const char* text) { return full_write(2, text); }
0421 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used.  They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// RESUME_SIGNAL is parenthesized so that it expands to a single operand in
// whatever expression it is used.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
0433 
0434   // does nothing, here only to interrupt the sleep() in the pause handler
0435   void sig_resume_handler(int sig, siginfo_t*, void*) {}
0436 
0437   // pause a thread so that a (slow) stacktrace will capture the current state
0438   void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
0439     using namespace edm::service;
0440 
0441 #ifdef RESUME_SIGNAL
0442     sigset_t sigset;
0443     sigemptyset(&sigset);
0444     sigaddset(&sigset, RESUME_SIGNAL);
0445     pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
0446 #endif
0447     // sleep interrrupts on a handled delivery of the resume signal
0448     sleep(InitRootHandlers::stackTracePause());
0449 
0450     if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
0451       auto i = InitRootHandlers::nextModule_++;
0452       if (i < InitRootHandlers::moduleListBuffers_.size()) {
0453         char* buff = InitRootHandlers::moduleListBuffers_[i].data();
0454 
0455         strlcpy(buff, "\nModule: ", moduleBufferSize);
0456         if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0457           strlcat(buff,
0458                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0459                   moduleBufferSize);
0460           strlcat(buff, ":", moduleBufferSize);
0461           strlcat(buff,
0462                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0463                   moduleBufferSize);
0464         } else {
0465           strlcat(buff, "none", moduleBufferSize);
0466         }
0467         ++edm::service::InitRootHandlers::doneModules_;
0468       }
0469     }
0470   }
0471 
0472   void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
0473     using namespace edm::service;
0474 
0475     const auto& tids = InitRootHandlers::threadIDs();
0476 
0477     const auto self = pthread_self();
0478 #ifdef PAUSE_SIGNAL
0479     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0480       // install the "pause" handler
0481       struct sigaction act;
0482       act.sa_sigaction = sig_pause_for_stacktrace;
0483       act.sa_flags = 0;
0484       sigemptyset(&act.sa_mask);
0485       sigaction(PAUSE_SIGNAL, &act, nullptr);
0486 
0487       // unblock pause signal globally, resume is unblocked in the pause handler
0488       sigset_t pausesigset;
0489       sigemptyset(&pausesigset);
0490       sigaddset(&pausesigset, PAUSE_SIGNAL);
0491       sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
0492 
0493       // send a pause signal to all CMSSW/TBB threads other than self
0494       for (auto id : tids) {
0495         if (self != id) {
0496           pthread_kill(id, PAUSE_SIGNAL);
0497         }
0498       }
0499 
0500 #ifdef RESUME_SIGNAL
0501       // install the "resume" handler
0502       act.sa_sigaction = sig_resume_handler;
0503       sigaction(RESUME_SIGNAL, &act, nullptr);
0504 #endif
0505     }
0506 #endif
0507 
0508     const char* signalname = "unknown";
0509     switch (sig) {
0510       case SIGBUS: {
0511         signalname = "bus error";
0512         break;
0513       }
0514       case SIGSEGV: {
0515         signalname = "segmentation violation";
0516         break;
0517       }
0518       case SIGILL: {
0519         signalname = "illegal instruction";
0520         break;
0521       }
0522       case SIGTERM: {
0523         signalname = "external termination request";
0524         break;
0525       }
0526       case SIGABRT: {
0527         signalname = "abort signal";
0528         break;
0529       }
0530       default:
0531         break;
0532     }
0533     full_cerr_write("\n\nA fatal system signal has occurred: ");
0534     full_cerr_write(signalname);
0535     full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
0536 
0537     edm::service::InitRootHandlers::stacktraceFromThread();
0538 
0539     // resume the signal handlers to store the current module; we are not guaranteed they
0540     // will have time to store their modules, so there is a race condition; this could be
0541     // avoided by storing the module information before sleeping, a change that may be
0542     // made when we're convinced accessing the thread-local current module is safe.
0543 #ifdef RESUME_SIGNAL
0544     std::size_t notified = 0;
0545     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0546       for (auto id : tids) {
0547         if (self != id) {
0548           if (pthread_kill(id, RESUME_SIGNAL) == 0)
0549             ++notified;
0550         }
0551       }
0552     }
0553 #endif
0554 
0555     full_cerr_write("\nCurrent Modules:\n");
0556 
0557     // Checking tids.count(self) ensures that we only try to access the current module in
0558     // CMSSW/TBB threads.  Those threads access the thread-local current module at the same
0559     // time the thread is registered, so any lazy allocation will have been done at that
0560     // point.  Not necessary on Linux with the current cmsRun linkage, as the thread-local
0561     // is allocated at exec time, not lazily.
0562     if (tids.count(self) > 0) {
0563       char buff[moduleBufferSize] = "\nModule: ";
0564       if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0565         strlcat(buff,
0566                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0567                 moduleBufferSize);
0568         strlcat(buff, ":", moduleBufferSize);
0569         strlcat(buff,
0570                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0571                 moduleBufferSize);
0572       } else {
0573         strlcat(buff, "none", moduleBufferSize);
0574       }
0575       strlcat(buff, " (crashed)", moduleBufferSize);
0576       full_cerr_write(buff);
0577     } else {
0578       full_cerr_write("\nModule: non-CMSSW (crashed)");
0579     }
0580 
0581 #ifdef PAUSE_SIGNAL
0582     // wait a short interval for the paused threads to resume and fill in their module
0583     // information, then print
0584     if (InitRootHandlers::doneModules_.is_lock_free()) {
0585       int spincount = 0;
0586       timespec t = {0, 1000};
0587       while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
0588         nanosleep(&t, nullptr);
0589       }
0590       for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
0591         full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
0592       }
0593     }
0594 #endif
0595 
0596     full_cerr_write("\n\nA fatal system signal has occurred: ");
0597     full_cerr_write(signalname);
0598     full_cerr_write("\n");
0599 
0600     // For these five known cases, re-raise the signal to get the correct
0601     // exit code.
0602     if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGABRT)) {
0603       signal(sig, SIG_DFL);
0604       raise(sig);
0605     } else {
0606       set_default_signals();
0607       ::abort();
0608     }
0609   }
0610 
0611   void sig_abort(int sig, siginfo_t*, void*) {
0612     full_cerr_write("\n\nFatal system signal has occurred during exit\n");
0613 
0614     // re-raise the signal to get the correct exit code
0615     signal(sig, SIG_DFL);
0616     raise(sig);
0617 
0618     // shouldn't get here
0619     set_default_signals();
0620     ::sleep(10);
0621     ::abort();
0622   }
0623   }
0624 }  // end of unnamed namespace
0625 
0626 namespace edm {
0627   namespace service {
0628 
0629     /*
0630      * We've run into issues where GDB fails to print the thread which calls clone().
0631      * To avoid this problem, we have an alternate approach below where the signal handler
0632      * only reads/writes to a dedicated thread via pipes.  The helper thread does the clone()
0633      * invocation; we don't care if that thread is missing from the traceback in this case.
0634      */
0635     static void cmssw_stacktrace_fork();
0636 
0637     void InitRootHandlers::stacktraceHelperThread() {
0638       int toParent = childToParent_[1];
0639       int fromParent = parentToChild_[0];
0640       char buf[2];
0641       buf[1] = '\0';
0642 
0643       while (true) {
0644         int result = full_read(fromParent, buf, 1);
0645         if (result < 0) {
0646           // To avoid a deadlock (this function is NOT re-entrant), reset signals
0647           // We never set them back to the CMSSW handler because we assume the parent
0648           // thread will abort for us.
0649           set_default_signals();
0650           close(toParent);
0651           full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
0652           full_cerr_write(strerror(-result));
0653           full_cerr_write("\n");
0654           ::abort();
0655         }
0656         if (buf[0] == '1') {
0657           set_default_signals();
0658           cmssw_stacktrace_fork();
0659           full_write(toParent, buf);
0660         } else if (buf[0] == '2') {
0661           // We have just finished forking.  Reload the file descriptors for thread
0662           // communication.
0663           close(toParent);
0664           close(fromParent);
0665           toParent = childToParent_[1];
0666           fromParent = parentToChild_[0];
0667         } else if (buf[0] == '3') {
0668           break;
0669         } else {
0670           set_default_signals();
0671           close(toParent);
0672           full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
0673           full_cerr_write(buf);
0674           full_cerr_write("\n");
0675           ::abort();
0676         }
0677       }
0678     }
0679 
0680     void InitRootHandlers::stacktraceFromThread() {
0681       int result = full_write(parentToChild_[1], "1");
0682       if (result < 0) {
0683         full_cerr_write("\n\nAttempt to request stacktrace failed: ");
0684         full_cerr_write(strerror(-result));
0685         full_cerr_write("\n");
0686         return;
0687       }
0688       char buf[2];
0689       buf[1] = '\0';
0690       if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
0691         full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
0692         if (result == -ETIMEDOUT) {
0693           full_cerr_write("timed out waiting for GDB to complete.");
0694         } else {
0695           full_cerr_write(strerror(-result));
0696         }
0697         full_cerr_write("\n");
0698         return;
0699       }
0700     }
0701 
0702     void cmssw_stacktrace_fork() {
0703       char child_stack[4 * 1024];
0704       char* child_stack_ptr = child_stack + 4 * 1024;
0705       // On Linux, we currently use jemalloc.  This registers pthread_atfork handlers; these
0706       // handlers are *not* async-signal safe.  Hence, a deadlock is possible if we invoke
0707       // fork() from our signal handlers.  Accordingly, we use clone (not POSIX, but AS-safe)
0708       // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
0709       int pid =
0710 #ifdef __linux__
0711           clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
0712 #else
0713           fork();
0714       if (child_stack_ptr) {
0715       }  // Suppress 'unused variable' warning on non-Linux
0716       if (pid == 0) {
0717         edm::service::cmssw_stacktrace(nullptr);
0718       }
0719 #endif
0720       if (pid == -1) {
0721         full_cerr_write("(Attempt to perform stack dump failed.)\n");
0722       } else {
0723         int status;
0724         if (waitpid(pid, &status, 0) == -1) {
0725           full_cerr_write("(Failed to wait on stack dump output.)\n");
0726         }
0727         if (status) {
0728           full_cerr_write("(GDB stack trace failed unexpectedly)\n");
0729         }
0730       }
0731     }
0732 
0733     int cmssw_stacktrace(void* /*arg*/) {
0734       set_default_signals();
0735 
0736       char const* const* argv = edm::service::InitRootHandlers::getPstackArgv();
0737       // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
0738       // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
0739       // calls dlsym.
0740 #ifdef __linux__
0741       syscall(SYS_execve, "/bin/sh", argv, __environ);
0742 #else
0743       execv("/bin/sh", argv);
0744 #endif
0745       ::abort();
0746       return 1;
0747     }
0748 
0749     static constexpr char pstackName[] = "(CMSSW stack trace helper)";
0750     static constexpr char dashC[] = "-c";
0751     char InitRootHandlers::pidString_[InitRootHandlers::pidStringLength_] = {};
0752     char const* const InitRootHandlers::pstackArgv_[] = {pstackName, dashC, InitRootHandlers::pidString_, nullptr};
0753     int InitRootHandlers::parentToChild_[2] = {-1, -1};
0754     int InitRootHandlers::childToParent_[2] = {-1, -1};
0755     std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
0756     std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
0757     int InitRootHandlers::stackTracePause_ = 300;
0758     std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
0759     std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
0760 
0761     InitRootHandlers::InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg)
0762         : RootHandlers(),
0763           unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
0764           resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
0765           loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
0766           autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
0767           interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
0768       stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
0769 
0770       if (not threadTracker_) {
0771         threadTracker_ = std::make_unique<ThreadTracker>();
0772         iReg.watchPostEndJob([]() {
0773           if (threadTracker_) {
0774             threadTracker_->observe(false);
0775           }
0776         });
0777       }
0778 
0779       if (unloadSigHandler_) {
0780         // Deactivate all the Root signal handlers and restore the system defaults
0781         gSystem->ResetSignal(kSigChild);
0782         gSystem->ResetSignal(kSigBus);
0783         gSystem->ResetSignal(kSigSegmentationViolation);
0784         gSystem->ResetSignal(kSigIllegalInstruction);
0785         gSystem->ResetSignal(kSigSystem);
0786         gSystem->ResetSignal(kSigPipe);
0787         gSystem->ResetSignal(kSigAlarm);
0788         gSystem->ResetSignal(kSigUrgent);
0789         gSystem->ResetSignal(kSigFloatingException);
0790         gSystem->ResetSignal(kSigWindowChanged);
0791       } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
0792         cachePidInfo();
0793 
0794         //NOTE: ROOT can also be told to abort on these kinds of problems BUT
0795         // it requires an TApplication to be instantiated which causes problems
0796         gSystem->ResetSignal(kSigBus);
0797         gSystem->ResetSignal(kSigSegmentationViolation);
0798         gSystem->ResetSignal(kSigIllegalInstruction);
0799         installCustomHandler(SIGBUS, sig_dostack_then_abort);
0800         sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
0801         installCustomHandler(SIGSEGV, sig_dostack_then_abort);
0802         sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
0803         installCustomHandler(SIGILL, sig_dostack_then_abort);
0804         sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
0805         installCustomHandler(SIGTERM, sig_dostack_then_abort);
0806         sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
0807         installCustomHandler(SIGABRT, sig_dostack_then_abort);
0808         sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
0809           signal(SIGABRT, SIG_DFL);  // release SIGABRT to default
0810         });
0811       }
0812 
0813       iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
0814         if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
0815           moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
0816         }
0817       });
0818 
0819       if (resetErrHandler_) {
0820         // Replace the Root error handler with one that uses the MessageLogger
0821         SetErrorHandler(RootErrorHandler);
0822       }
0823 
0824       // Enable automatic Root library loading.
0825       if (autoLibraryLoader_) {
0826         gInterpreter->SetClassAutoloading(1);
0827       }
0828 
0829       // Set ROOT parameters.
0830       TTree::SetMaxTreeSize(kMaxLong64);
0831       TH1::AddDirectory(kFALSE);
0832       //G__SetCatchException(0);
0833 
0834       // Set custom streamers
0835       setRefCoreStreamerInTClass();
0836 
0837       // Load the library containing dictionaries for std:: classes, if not already loaded.
0838       if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
0839         TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
0840       }
0841 
0842       int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
0843       if (debugLevel > 0) {
0844         gDebug = debugLevel;
0845       }
0846 
0847       // Enable Root implicit multi-threading
0848       bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
0849       if (imt && not ROOT::IsImplicitMTEnabled()) {
0850         //cmsRun uses global_control to set the number of allowed threads to use
0851         // we need to tell ROOT the same value in order to avoid unnecessary warnings
0852         ROOT::EnableImplicitMT(
0853             oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
0854       }
0855     }
0856 
0857     InitRootHandlers::~InitRootHandlers() {
0858       // close all open ROOT files
0859       TIter iter(gROOT->GetListOfFiles());
0860       TObject* obj = nullptr;
0861       while (nullptr != (obj = iter.Next())) {
0862         TFile* f = dynamic_cast<TFile*>(obj);
0863         if (f) {
0864           // We get a new iterator each time,
0865           // because closing a file can invalidate the iterator
0866           f->Close();
0867           iter = TIter(gROOT->GetListOfFiles());
0868         }
0869       }
0870       //disengage from TBB to avoid possible at exit problems
0871       threadTracker_.reset();
0872     }
0873 
0874     void InitRootHandlers::willBeUsingThreads() {
0875       //Tell Root we want to be multi-threaded
0876       ROOT::EnableThreadSafety();
0877 
0878       //When threading, also have to keep ROOT from logging all TObjects into a list
0879       TObject::SetObjectStat(false);
0880 
0881       //Have to avoid having Streamers modify themselves after they have been used
0882       TVirtualStreamerInfo::Optimize(false);
0883     }
0884 
0885     void InitRootHandlers::fillDescriptions(ConfigurationDescriptions& descriptions) {
0886       ParameterSetDescription desc;
0887       desc.setComment("Centralized interface to ROOT.");
0888       desc.addUntracked<bool>("UnloadRootSigHandler", false)
0889           ->setComment("If True, signals are handled by this service, rather than by ROOT.");
0890       desc.addUntracked<bool>("ResetRootErrHandler", true)
0891           ->setComment(
0892               "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
0893       desc.addUntracked<bool>("AutoLibraryLoader", true)
0894           ->setComment("If True, enables automatic loading of data dictionaries.");
0895       desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
0896       desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
0897       desc.addUntracked<bool>("AbortOnSignal", true)
0898           ->setComment(
0899               "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
0900               "attempts to do a clean shutdown.");
0901       desc.addUntracked<bool>("InteractiveDebug", false)
0902           ->setComment(
0903               "If True, leave gdb attached to cmsRun after a crash; "
0904               "if False, attach gdb, print a stack trace, and quit gdb");
0905       desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
0906       desc.addUntracked<int>("StackTracePauseTime", 300)
0907           ->setComment("Seconds to pause other threads during stack trace.");
0908       descriptions.add("InitRootHandlers", desc);
0909     }
0910 
0911     char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
0912 
0913     void InitRootHandlers::enableWarnings_() { s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo; }
0914 
0915     void InitRootHandlers::ignoreWarnings_(edm::RootHandlers::SeverityLevel level) { s_ignoreWarnings = level; }
0916 
0917     void InitRootHandlers::cachePidInfo() {
0918       if (helperThread_) {
0919         //Another InitRootHandlers was initialized in this job, possibly
0920         // because multiple EventProcessors are being used.
0921         //In that case, we are already all setup
0922         return;
0923       }
0924       std::string gdbcmd{"date; gdb -quiet -p %d"};
0925       if (!interactiveDebug_) {
0926         gdbcmd +=
0927             " 2>&1 <<EOF |\n"
0928             "set width 0\n"
0929             "set height 0\n"
0930             "set pagination no\n"
0931             "thread apply all bt\n"
0932             "EOF\n"
0933             "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
0934       }
0935       if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
0936         std::ostringstream sstr;
0937         sstr << "Unable to pre-allocate stacktrace handler information";
0938         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0939         throw except;
0940       }
0941 
0942       // These are initialized to -1; harmless to close an invalid FD.
0943       // If this is called post-fork, we don't want to be communicating on
0944       // these FDs as they are used internally by the parent.
0945       close(childToParent_[0]);
0946       close(childToParent_[1]);
0947       childToParent_[0] = -1;
0948       childToParent_[1] = -1;
0949       close(parentToChild_[0]);
0950       close(parentToChild_[1]);
0951       parentToChild_[0] = -1;
0952       parentToChild_[1] = -1;
0953 
0954       if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
0955         std::ostringstream sstr;
0956         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0957         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0958         throw except;
0959       }
0960 
0961       if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
0962         close(childToParent_[0]);
0963         close(childToParent_[1]);
0964         childToParent_[0] = -1;
0965         childToParent_[1] = -1;
0966         std::ostringstream sstr;
0967         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0968         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0969         throw except;
0970       }
0971 
0972       helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
0973       helperThread_->detach();
0974     }
0975 
0976   }  // end of namespace service
0977 }  // end of namespace edm
0978 
0979 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0980 
0981 using edm::service::InitRootHandlers;
0982 typedef edm::serviceregistry::AllArgsMaker<edm::RootHandlers, InitRootHandlers> RootHandlersMaker;
0983 DEFINE_FWK_SERVICE_MAKER(InitRootHandlers, RootHandlersMaker);