Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-03-07 01:53:27

0001 #include "FWCore/AbstractServices/interface/RootHandlers.h"
0002 
0003 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0004 #include "FWCore/ServiceRegistry/interface/SystemBounds.h"
0005 #include "DataFormats/Common/interface/RefCoreStreamer.h"
0006 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0007 #include "FWCore/MessageLogger/interface/ELseverityLevel.h"
0008 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0010 #include "FWCore/PluginManager/interface/PluginCapabilities.h"
0011 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0012 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0013 #include "FWCore/Utilities/interface/EDMException.h"
0014 #include "FWCore/Reflection/interface/TypeWithDict.h"
0015 #include "FWCore/Utilities/interface/UnixSignalHandlers.h"
0016 #include "FWCore/ServiceRegistry/interface/CurrentModuleOnThread.h"
0017 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0018 
0019 #include "oneapi/tbb/concurrent_unordered_set.h"
0020 #include "oneapi/tbb/task.h"
0021 #include "oneapi/tbb/task_scheduler_observer.h"
0022 #include "oneapi/tbb/global_control.h"
0023 #include <memory>
0024 
0025 #include <thread>
0026 #include <sys/wait.h>
0027 #include <sstream>
0028 #include <cstring>
0029 #include <poll.h>
0030 #include <atomic>
0031 #include <algorithm>
0032 #include <vector>
0033 #include <string>
0034 #include <array>
0035 
0036 // WORKAROUND: At CERN, execv is replaced with a non-async-signal safe
0037 // version.  This can break our stack trace printer.  Avoid this by
0038 // invoking the syscall directly.
0039 #ifdef __linux__
0040 #include <syscall.h>
0041 #endif
0042 
0043 #include "TROOT.h"
0044 #include "TError.h"
0045 #include "TFile.h"
0046 #include "TInterpreter.h"
0047 #include "TH1.h"
0048 #include "TSystem.h"
0049 #include "TUnixSystem.h"
0050 #include "TTree.h"
0051 #include "TVirtualStreamerInfo.h"
0052 
0053 #include "TClassTable.h"
0054 
0055 #include <memory>
0056 
namespace {
  // Capacity, in bytes, of each fixed-size buffer that holds one
  // "Module: ..." line written out after a stacktrace abort.
  constexpr std::size_t moduleBufferSize{128};
}  // namespace
0062 
0063 namespace edm {
0064   class ConfigurationDescriptions;
0065   class ParameterSet;
0066   class ActivityRegistry;
0067 
0068   namespace service {
0069     class InitRootHandlers : public RootHandlers {
0070       friend int cmssw_stacktrace(void*);
0071 
0072     public:
0073       class ThreadTracker : public oneapi::tbb::task_scheduler_observer {
0074       public:
0075         typedef oneapi::tbb::concurrent_unordered_set<pthread_t> Container_type;
0076 
0077         ThreadTracker() : oneapi::tbb::task_scheduler_observer() { observe(); }
0078         ~ThreadTracker() override = default;
0079 
0080         void on_scheduler_entry(bool) override {
0081           // ensure thread local has been allocated; not necessary on Linux with
0082           // the current cmsRun linkage, but could be an issue if the platform
0083           // or linkage leads to "lazy" allocation of the thread local.  By
0084           // referencing it here we make sure it has been allocated and can be
0085           // accessed safely from our signal handler.
0086           edm::CurrentModuleOnThread::getCurrentModuleOnThread();
0087           threadIDs_.insert(pthread_self());
0088         }
0089         void on_scheduler_exit(bool) override {}
0090         const Container_type& IDs() { return threadIDs_; }
0091 
0092       private:
0093         Container_type threadIDs_;
0094       };
0095 
0096       explicit InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg);
0097       ~InitRootHandlers() override;
0098 
0099       static void fillDescriptions(ConfigurationDescriptions& descriptions);
0100       static void stacktraceFromThread();
0101       static const ThreadTracker::Container_type& threadIDs() {
0102         static const ThreadTracker::Container_type empty;
0103         if (threadTracker_) {
0104           return threadTracker_->IDs();
0105         }
0106         return empty;
0107       }
0108       static int stackTracePause() { return stackTracePause_; }
0109 
0110       static std::vector<std::array<char, moduleBufferSize>> moduleListBuffers_;
0111       static std::atomic<std::size_t> nextModule_, doneModules_;
0112 
0113     private:
0114       static char const* const* getPstackArgv();
0115       void enableWarnings_() override;
0116       void ignoreWarnings_(edm::RootHandlers::SeverityLevel level) override;
0117       void willBeUsingThreads() override;
0118 
0119       void cachePidInfo();
0120       static void stacktraceHelperThread();
0121 
0122       static constexpr int pidStringLength_ = 200;
0123       static char pidString_[pidStringLength_];
0124       static char const* const pstackArgv_[];
0125       static int parentToChild_[2];
0126       static int childToParent_[2];
0127       static std::unique_ptr<std::thread> helperThread_;
0128       static std::unique_ptr<ThreadTracker> threadTracker_;
0129       static int stackTracePause_;
0130 
0131       bool unloadSigHandler_;
0132       bool resetErrHandler_;
0133       bool loadAllDictionaries_;
0134       bool autoLibraryLoader_;
0135       bool autoClassParser_;
0136       bool interactiveDebug_;
0137       std::shared_ptr<const void> sigBusHandler_;
0138       std::shared_ptr<const void> sigSegvHandler_;
0139       std::shared_ptr<const void> sigIllHandler_;
0140       std::shared_ptr<const void> sigTermHandler_;
0141       std::shared_ptr<const void> sigAbrtHandler_;
0142       std::shared_ptr<const void> sigFpeHandler_;
0143     };
0144 
0145     inline bool isProcessWideService(InitRootHandlers const*) { return true; }
0146 
0147   }  // end of namespace service
0148 }  // end of namespace edm
0149 
0150 namespace edm {
0151   namespace service {
0152     int cmssw_stacktrace(void*);
0153   }
0154 }  // namespace edm
0155 
0156 namespace {
0157   thread_local edm::RootHandlers::SeverityLevel s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo;
0158 
0159   constexpr bool s_ignoreEverything = false;
0160 
0161   template <std::size_t SIZE>
0162   bool find_if_string(const std::string& search, const std::array<const char* const, SIZE>& substrs) {
0163     return (std::find_if(substrs.begin(), substrs.end(), [&search](const char* const s) -> bool {
0164               return (search.find(s) != std::string::npos);
0165             }) != substrs.end());
0166   }
0167 
0168   //Contents of a message which should be reported as an INFO not a ERROR
0169   constexpr std::array<const char* const, 11> in_message{
0170       {"no dictionary for class",
0171        "already in TClassTable",
0172        "matrix not positive definite",
0173        "not a TStreamerInfo object",
0174        "Problems declaring payload",
0175        "Announced number of args different from the real number of argument passed",  // Always printed if gDebug>0 - regardless of whether warning message is real.
0176        "nbins is <=0 - set to nbins = 1",
0177        "nbinsy is <=0 - set to nbinsy = 1",
0178        "oneapi::tbb::global_control is limiting",
0179        "ufirst < fXmin, fXmin is used",
0180        "ulast > fXmax, fXmax is used"}};
0181 
0182   //Location generating messages which should be reported as an INFO not a ERROR
0183   constexpr std::array<const char* const, 7> in_location{{"Fit",
0184                                                           "TDecompChol::Solve",
0185                                                           "THistPainter::PaintInit",
0186                                                           "TUnixSystem::SetDisplay",
0187                                                           "TGClient::GetFontByName",
0188                                                           "Inverter::Dinv",
0189                                                           "RTaskArenaWrapper"}};
0190 
0191   constexpr std::array<const char* const, 4> in_message_print_error{
0192       {"number of iterations was insufficient",
0193        "bad integrand behavior",
0194        "integral is divergent, or slowly convergent",
0195        "VariableMetricBuilder Initial matrix not pos.def."}};
0196 
0197   void RootErrorHandlerImpl(int level, char const* location, char const* message) {
0198     bool die = false;
0199 
0200     // Translate ROOT severity level to MessageLogger severity level
0201 
0202     edm::RootHandlers::SeverityLevel el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0203 
0204     if (level >= kFatal) {
0205       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0206     } else if (level >= kSysError) {
0207       el_severity = edm::RootHandlers::SeverityLevel::kSysError;
0208     } else if (level >= kError) {
0209       el_severity = edm::RootHandlers::SeverityLevel::kError;
0210     } else if (level >= kWarning) {
0211       el_severity = edm::RootHandlers::SeverityLevel::kWarning;
0212     }
0213 
0214     if (s_ignoreEverything || el_severity <= s_ignoreWarnings) {
0215       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0216     }
0217 
0218     // Adapt C-strings to std::strings
0219     // Arrange to report the error location as furnished by Root
0220 
0221     std::string el_location = "@SUB=?";
0222     if (location != nullptr)
0223       el_location = std::string("@SUB=") + std::string(location);
0224 
0225     std::string el_message = "?";
0226     if (message != nullptr)
0227       el_message = message;
0228 
0229     // Try to create a meaningful id string using knowledge of ROOT error messages
0230     //
0231     // id ==     "ROOT-ClassName" where ClassName is the affected class
0232     //      else "ROOT/ClassName" where ClassName is the error-declaring class
0233     //      else "ROOT"
0234 
0235     std::string el_identifier = "ROOT";
0236 
0237     std::string precursor("class ");
0238     size_t index1 = el_message.find(precursor);
0239     if (index1 != std::string::npos) {
0240       size_t index2 = index1 + precursor.length();
0241       size_t index3 = el_message.find_first_of(" :", index2);
0242       if (index3 != std::string::npos) {
0243         size_t substrlen = index3 - index2;
0244         el_identifier += "-";
0245         el_identifier += el_message.substr(index2, substrlen);
0246       }
0247     } else {
0248       index1 = el_location.find("::");
0249       if (index1 != std::string::npos) {
0250         el_identifier += "/";
0251         el_identifier += el_location.substr(0, index1);
0252       }
0253     }
0254 
0255     // Intercept some messages and upgrade the severity
0256 
0257     if ((el_location.find("TBranchElement::Fill") != std::string::npos) &&
0258         (el_message.find("fill branch") != std::string::npos) && (el_message.find("address") != std::string::npos) &&
0259         (el_message.find("not set") != std::string::npos)) {
0260       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0261     }
0262 
0263     if ((el_message.find("Tree branches") != std::string::npos) &&
0264         (el_message.find("different numbers of entries") != std::string::npos)) {
0265       el_severity = edm::RootHandlers::SeverityLevel::kFatal;
0266     }
0267 
0268     // Intercept some messages and downgrade the severity
0269 
0270     if (find_if_string(el_message, in_message) || find_if_string(el_location, in_location) ||
0271         (level < kError and (el_location.find("CINTTypedefBuilder::Setup") != std::string::npos) and
0272          (el_message.find("possible entries are in use!") != std::string::npos))) {
0273       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0274     }
0275 
0276     // These are a special case because we do not want them to
0277     // be fatal, but we do want an error to print.
0278     bool alreadyPrinted = false;
0279     if (find_if_string(el_message, in_message_print_error)) {
0280       el_severity = edm::RootHandlers::SeverityLevel::kInfo;
0281       edm::LogError("Root_Error") << el_location << el_message;
0282       alreadyPrinted = true;
0283     }
0284 
0285     if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0286       // Don't throw if the message is just informational.
0287       die = false;
0288     } else {
0289       die = true;
0290     }
0291 
0292     // Feed the message to the MessageLogger and let it choose to suppress or not.
0293 
0294     // Root has declared a fatal error.  Throw an EDMException unless the
0295     // message corresponds to a pending signal. In that case, do not throw
0296     // but let the OS deal with the signal in the usual way.
0297     if (die && (el_location != std::string("@SUB=TUnixSystem::DispatchSignals"))) {
0298       std::ostringstream sstr;
0299       sstr << "Fatal Root Error: " << el_location << "\n" << el_message << '\n';
0300       edm::Exception except(edm::errors::FatalRootError, sstr.str());
0301       except.addAdditionalInfo(except.message());
0302       except.clearMessage();
0303       throw except;
0304     }
0305 
0306     // Typically, we get here only for informational messages,
0307     // but we leave the other code in just in case we change
0308     // the criteria for throwing.
0309     if (!alreadyPrinted) {
0310       if (el_severity == edm::RootHandlers::SeverityLevel::kFatal) {
0311         edm::LogError("Root_Fatal") << el_location << el_message;
0312       } else if (el_severity == edm::RootHandlers::SeverityLevel::kSysError) {
0313         edm::LogError("Root_Severe") << el_location << el_message;
0314       } else if (el_severity == edm::RootHandlers::SeverityLevel::kError) {
0315         edm::LogError("Root_Error") << el_location << el_message;
0316       } else if (el_severity == edm::RootHandlers::SeverityLevel::kWarning) {
0317         edm::LogWarning("Root_Warning") << el_location << el_message;
0318       } else if (el_severity == edm::RootHandlers::SeverityLevel::kInfo) {
0319         edm::LogInfo("Root_Information") << el_location << el_message;
0320       }
0321     }
0322   }
0323 
0324   void RootErrorHandler(int level, bool, char const* location, char const* message) {
0325     RootErrorHandlerImpl(level, location, message);
0326   }
0327 
0328   extern "C" {
0329   void set_default_signals() {
0330     signal(SIGILL, SIG_DFL);
0331     signal(SIGSEGV, SIG_DFL);
0332     signal(SIGBUS, SIG_DFL);
0333     signal(SIGTERM, SIG_DFL);
0334     signal(SIGFPE, SIG_DFL);
0335     signal(SIGABRT, SIG_DFL);
0336   }
0337 
0338   static int full_write(int fd, const char* text) {
0339     const char* buffer = text;
0340     size_t count = strlen(text);
0341     ssize_t written = 0;
0342     while (count) {
0343       written = write(fd, buffer, count);
0344       if (written == -1) {
0345         if (errno == EINTR) {
0346           continue;
0347         } else {
0348           return -errno;
0349         }
0350       }
0351       count -= written;
0352       buffer += written;
0353     }
0354     return 0;
0355   }
0356 
0357   static int full_read(int fd, char* inbuf, size_t len, int timeout_s = -1) {
0358     char* buf = inbuf;
0359     size_t count = len;
0360     ssize_t complete = 0;
0361     std::chrono::time_point<std::chrono::steady_clock> end_time =
0362         std::chrono::steady_clock::now() + std::chrono::seconds(timeout_s);
0363     int flags;
0364     if (timeout_s < 0) {
0365       flags = O_NONBLOCK;  // Prevents us from trying to set / restore flags later.
0366     } else if ((-1 == (flags = fcntl(fd, F_GETFL)))) {
0367       return -errno;
0368     }
0369     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0370       if (-1 == fcntl(fd, F_SETFL, flags | O_NONBLOCK)) {
0371         return -errno;
0372       }
0373     }
0374     while (count) {
0375       if (timeout_s >= 0) {
0376         struct pollfd poll_info {
0377           fd, POLLIN, 0
0378         };
0379         int ms_remaining =
0380             std::chrono::duration_cast<std::chrono::milliseconds>(end_time - std::chrono::steady_clock::now()).count();
0381         if (ms_remaining > 0) {
0382           int rc = poll(&poll_info, 1, ms_remaining);
0383           if (rc <= 0) {
0384             if (rc < 0) {
0385               if (errno == EINTR || errno == EAGAIN) {
0386                 continue;
0387               }
0388               rc = -errno;
0389             } else {
0390               rc = -ETIMEDOUT;
0391             }
0392             if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0393               fcntl(fd, F_SETFL, flags);
0394             }
0395             return rc;
0396           }
0397         } else if (ms_remaining < 0) {
0398           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0399             fcntl(fd, F_SETFL, flags);
0400           }
0401           return -ETIMEDOUT;
0402         }
0403       }
0404       complete = read(fd, buf, count);
0405       if (complete == -1) {
0406         if (errno == EINTR) {
0407           continue;
0408         } else if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
0409           continue;
0410         } else {
0411           int orig_errno = errno;
0412           if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0413             fcntl(fd, F_SETFL, flags);
0414           }
0415           return -orig_errno;
0416         }
0417       }
0418       count -= complete;
0419       buf += complete;
0420     }
0421     if ((flags & O_NONBLOCK) != O_NONBLOCK) {
0422       fcntl(fd, F_SETFL, flags);
0423     }
0424     return 0;
0425   }
0426 
0427   static int full_cerr_write(const char* text) { return full_write(2, text); }
0428 
// These signals are only used inside the stacktrace signal handler,
// so common signals can be used.  They do have to be different, since
// we do not set SA_NODEFER, and RESUME must be a signal that will
// cause sleep() to return early.
// RESUME_SIGNAL is parenthesised so that it expands safely inside any
// larger expression.
#if defined(SIGRTMAX)
#define PAUSE_SIGNAL SIGRTMAX
#define RESUME_SIGNAL (SIGRTMAX - 1)
#elif defined(SIGINFO)  // macOS/BSD
#define PAUSE_SIGNAL SIGINFO
#define RESUME_SIGNAL SIGALRM
#endif
0440 
0441   // does nothing, here only to interrupt the sleep() in the pause handler
0442   void sig_resume_handler(int sig, siginfo_t*, void*) {}
0443 
0444   // pause a thread so that a (slow) stacktrace will capture the current state
0445   void sig_pause_for_stacktrace(int sig, siginfo_t*, void*) {
0446     using namespace edm::service;
0447 
0448 #ifdef RESUME_SIGNAL
0449     sigset_t sigset;
0450     sigemptyset(&sigset);
0451     sigaddset(&sigset, RESUME_SIGNAL);
0452     pthread_sigmask(SIG_UNBLOCK, &sigset, nullptr);
0453 #endif
0454     // sleep interrrupts on a handled delivery of the resume signal
0455     sleep(InitRootHandlers::stackTracePause());
0456 
0457     if (InitRootHandlers::doneModules_.is_lock_free() && InitRootHandlers::nextModule_.is_lock_free()) {
0458       auto i = InitRootHandlers::nextModule_++;
0459       if (i < InitRootHandlers::moduleListBuffers_.size()) {
0460         char* buff = InitRootHandlers::moduleListBuffers_[i].data();
0461 
0462         strlcpy(buff, "\nModule: ", moduleBufferSize);
0463         if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0464           strlcat(buff,
0465                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0466                   moduleBufferSize);
0467           strlcat(buff, ":", moduleBufferSize);
0468           strlcat(buff,
0469                   edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0470                   moduleBufferSize);
0471         } else {
0472           strlcat(buff, "none", moduleBufferSize);
0473         }
0474         ++edm::service::InitRootHandlers::doneModules_;
0475       }
0476     }
0477   }
0478 
0479   void sig_dostack_then_abort(int sig, siginfo_t*, void*) {
0480     using namespace edm::service;
0481 
0482     const auto& tids = InitRootHandlers::threadIDs();
0483 
0484     const auto self = pthread_self();
0485 #ifdef PAUSE_SIGNAL
0486     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0487       // install the "pause" handler
0488       struct sigaction act;
0489       act.sa_sigaction = sig_pause_for_stacktrace;
0490       act.sa_flags = 0;
0491       sigemptyset(&act.sa_mask);
0492       sigaction(PAUSE_SIGNAL, &act, nullptr);
0493 
0494       // unblock pause signal globally, resume is unblocked in the pause handler
0495       sigset_t pausesigset;
0496       sigemptyset(&pausesigset);
0497       sigaddset(&pausesigset, PAUSE_SIGNAL);
0498       sigprocmask(SIG_UNBLOCK, &pausesigset, nullptr);
0499 
0500       // send a pause signal to all CMSSW/TBB threads other than self
0501       for (auto id : tids) {
0502         if (self != id) {
0503           pthread_kill(id, PAUSE_SIGNAL);
0504         }
0505       }
0506 
0507 #ifdef RESUME_SIGNAL
0508       // install the "resume" handler
0509       act.sa_sigaction = sig_resume_handler;
0510       sigaction(RESUME_SIGNAL, &act, nullptr);
0511 #endif
0512     }
0513 #endif
0514 
0515     const char* signalname = "unknown";
0516     switch (sig) {
0517       case SIGBUS: {
0518         signalname = "bus error";
0519         break;
0520       }
0521       case SIGSEGV: {
0522         signalname = "segmentation violation";
0523         break;
0524       }
0525       case SIGILL: {
0526         signalname = "illegal instruction";
0527         break;
0528       }
0529       case SIGFPE: {
0530         signalname = "floating point exception";
0531         break;
0532       }
0533       case SIGTERM: {
0534         signalname = "external termination request";
0535         break;
0536       }
0537       case SIGABRT: {
0538         signalname = "abort signal";
0539         break;
0540       }
0541       default:
0542         break;
0543     }
0544     full_cerr_write("\n\nA fatal system signal has occurred: ");
0545     full_cerr_write(signalname);
0546     full_cerr_write("\nThe following is the call stack containing the origin of the signal.\n\n");
0547 
0548     edm::service::InitRootHandlers::stacktraceFromThread();
0549 
0550     // resume the signal handlers to store the current module; we are not guaranteed they
0551     // will have time to store their modules, so there is a race condition; this could be
0552     // avoided by storing the module information before sleeping, a change that may be
0553     // made when we're convinced accessing the thread-local current module is safe.
0554 #ifdef RESUME_SIGNAL
0555     std::size_t notified = 0;
0556     if (InitRootHandlers::stackTracePause() > 0 && tids.size() > 1) {
0557       for (auto id : tids) {
0558         if (self != id) {
0559           if (pthread_kill(id, RESUME_SIGNAL) == 0)
0560             ++notified;
0561         }
0562       }
0563     }
0564 #endif
0565 
0566     full_cerr_write("\nCurrent Modules:\n");
0567 
0568     // Checking tids.count(self) ensures that we only try to access the current module in
0569     // CMSSW/TBB threads.  Those threads access the thread-local current module at the same
0570     // time the thread is registered, so any lazy allocation will have been done at that
0571     // point.  Not necessary on Linux with the current cmsRun linkage, as the thread-local
0572     // is allocated at exec time, not lazily.
0573     if (tids.count(self) > 0) {
0574       char buff[moduleBufferSize] = "\nModule: ";
0575       if (edm::CurrentModuleOnThread::getCurrentModuleOnThread() != nullptr) {
0576         strlcat(buff,
0577                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleName().c_str(),
0578                 moduleBufferSize);
0579         strlcat(buff, ":", moduleBufferSize);
0580         strlcat(buff,
0581                 edm::CurrentModuleOnThread::getCurrentModuleOnThread()->moduleDescription()->moduleLabel().c_str(),
0582                 moduleBufferSize);
0583       } else {
0584         strlcat(buff, "none", moduleBufferSize);
0585       }
0586       strlcat(buff, " (crashed)", moduleBufferSize);
0587       full_cerr_write(buff);
0588     } else {
0589       full_cerr_write("\nModule: non-CMSSW (crashed)");
0590     }
0591 
0592 #ifdef PAUSE_SIGNAL
0593     // wait a short interval for the paused threads to resume and fill in their module
0594     // information, then print
0595     if (InitRootHandlers::doneModules_.is_lock_free()) {
0596       int spincount = 0;
0597       timespec t = {0, 1000};
0598       while (++spincount < 1000 && InitRootHandlers::doneModules_ < notified) {
0599         nanosleep(&t, nullptr);
0600       }
0601       for (std::size_t i = 0; i < InitRootHandlers::doneModules_; ++i) {
0602         full_cerr_write(InitRootHandlers::moduleListBuffers_[i].data());
0603       }
0604     }
0605 #endif
0606 
0607     full_cerr_write("\n\nA fatal system signal has occurred: ");
0608     full_cerr_write(signalname);
0609     full_cerr_write("\n");
0610 
0611     // For these known cases, re-raise the signal to get the correct
0612     // exit code.
0613     if ((sig == SIGILL) || (sig == SIGSEGV) || (sig == SIGBUS) || (sig == SIGTERM) || (sig == SIGFPE) ||
0614         (sig == SIGABRT)) {
0615       signal(sig, SIG_DFL);
0616       raise(sig);
0617     } else {
0618       set_default_signals();
0619       ::abort();
0620     }
0621   }
0622 
0623   void sig_abort(int sig, siginfo_t*, void*) {
0624     full_cerr_write("\n\nFatal system signal has occurred during exit\n");
0625 
0626     // re-raise the signal to get the correct exit code
0627     signal(sig, SIG_DFL);
0628     raise(sig);
0629 
0630     // shouldn't get here
0631     set_default_signals();
0632     ::sleep(10);
0633     ::abort();
0634   }
0635   }
0636 }  // end of unnamed namespace
0637 
0638 namespace edm {
0639   namespace service {
0640 
0641     /*
0642      * We've run into issues where GDB fails to print the thread which calls clone().
0643      * To avoid this problem, we have an alternate approach below where the signal handler
0644      * only reads/writes to a dedicated thread via pipes.  The helper thread does the clone()
0645      * invocation; we don't care if that thread is missing from the traceback in this case.
0646      */
0647     static void cmssw_stacktrace_fork();
0648 
0649     void InitRootHandlers::stacktraceHelperThread() {
0650       int toParent = childToParent_[1];
0651       int fromParent = parentToChild_[0];
0652       char buf[2];
0653       buf[1] = '\0';
0654 
0655       while (true) {
0656         int result = full_read(fromParent, buf, 1);
0657         if (result < 0) {
0658           // To avoid a deadlock (this function is NOT re-entrant), reset signals
0659           // We never set them back to the CMSSW handler because we assume the parent
0660           // thread will abort for us.
0661           set_default_signals();
0662           close(toParent);
0663           full_cerr_write("\n\nTraceback helper thread failed to read from parent: ");
0664           full_cerr_write(strerror(-result));
0665           full_cerr_write("\n");
0666           ::abort();
0667         }
0668         if (buf[0] == '1') {
0669           set_default_signals();
0670           cmssw_stacktrace_fork();
0671           full_write(toParent, buf);
0672         } else if (buf[0] == '2') {
0673           // We have just finished forking.  Reload the file descriptors for thread
0674           // communication.
0675           close(toParent);
0676           close(fromParent);
0677           toParent = childToParent_[1];
0678           fromParent = parentToChild_[0];
0679         } else if (buf[0] == '3') {
0680           break;
0681         } else {
0682           set_default_signals();
0683           close(toParent);
0684           full_cerr_write("\n\nTraceback helper thread got unknown command from parent: ");
0685           full_cerr_write(buf);
0686           full_cerr_write("\n");
0687           ::abort();
0688         }
0689       }
0690     }
0691 
0692     void InitRootHandlers::stacktraceFromThread() {
0693       int result = full_write(parentToChild_[1], "1");
0694       if (result < 0) {
0695         full_cerr_write("\n\nAttempt to request stacktrace failed: ");
0696         full_cerr_write(strerror(-result));
0697         full_cerr_write("\n");
0698         return;
0699       }
0700       char buf[2];
0701       buf[1] = '\0';
0702       if ((result = full_read(childToParent_[0], buf, 1, 5 * 60)) < 0) {
0703         full_cerr_write("\n\nWaiting for stacktrace completion failed: ");
0704         if (result == -ETIMEDOUT) {
0705           full_cerr_write("timed out waiting for GDB to complete.");
0706         } else {
0707           full_cerr_write(strerror(-result));
0708         }
0709         full_cerr_write("\n");
0710         return;
0711       }
0712     }
0713 
0714     void cmssw_stacktrace_fork() {
0715       char child_stack[4 * 1024];
0716       char* child_stack_ptr = child_stack + 4 * 1024;
0717       // On Linux, we currently use jemalloc.  This registers pthread_atfork handlers; these
0718       // handlers are *not* async-signal safe.  Hence, a deadlock is possible if we invoke
0719       // fork() from our signal handlers.  Accordingly, we use clone (not POSIX, but AS-safe)
0720       // as that is closer to the 'raw metal' syscall and avoids pthread_atfork handlers.
0721       int pid =
0722 #ifdef __linux__
0723           clone(edm::service::cmssw_stacktrace, child_stack_ptr, CLONE_VM | CLONE_FS | SIGCHLD, nullptr);
0724 #else
0725           fork();
0726       if (child_stack_ptr) {
0727       }  // Suppress 'unused variable' warning on non-Linux
0728       if (pid == 0) {
0729         edm::service::cmssw_stacktrace(nullptr);
0730       }
0731 #endif
0732       if (pid == -1) {
0733         full_cerr_write("(Attempt to perform stack dump failed.)\n");
0734       } else {
0735         int status;
0736         if (waitpid(pid, &status, 0) == -1) {
0737           full_cerr_write("(Failed to wait on stack dump output.)\n");
0738         }
0739         if (status) {
0740           full_cerr_write("(GDB stack trace failed unexpectedly)\n");
0741         }
0742       }
0743     }
0744 
0745     int cmssw_stacktrace(void* /*arg*/) {
0746       set_default_signals();
0747 
0748       char const* const* argv = edm::service::InitRootHandlers::getPstackArgv();
0749       // NOTE: this is NOT async-signal-safe at CERN's lxplus service.
0750       // CERN uses LD_PRELOAD to replace execv with a function from libsnoopy which
0751       // calls dlsym.
0752 #ifdef __linux__
0753       syscall(SYS_execve, "/bin/sh", argv, __environ);
0754 #else
0755       execv("/bin/sh", argv);
0756 #endif
0757       ::abort();
0758       return 1;
0759     }
0760 
0761     static constexpr char pstackName[] = "(CMSSW stack trace helper)";
0762     static constexpr char dashC[] = "-c";
0763     char InitRootHandlers::pidString_[InitRootHandlers::pidStringLength_] = {};
0764     char const* const InitRootHandlers::pstackArgv_[] = {pstackName, dashC, InitRootHandlers::pidString_, nullptr};
0765     int InitRootHandlers::parentToChild_[2] = {-1, -1};
0766     int InitRootHandlers::childToParent_[2] = {-1, -1};
0767     std::unique_ptr<std::thread> InitRootHandlers::helperThread_;
0768     std::unique_ptr<InitRootHandlers::ThreadTracker> InitRootHandlers::threadTracker_;
0769     int InitRootHandlers::stackTracePause_ = 300;
0770     std::vector<std::array<char, moduleBufferSize>> InitRootHandlers::moduleListBuffers_;
0771     std::atomic<std::size_t> InitRootHandlers::nextModule_(0), InitRootHandlers::doneModules_(0);
0772 
0773     InitRootHandlers::InitRootHandlers(ParameterSet const& pset, ActivityRegistry& iReg)
0774         : RootHandlers(),
0775           unloadSigHandler_(pset.getUntrackedParameter<bool>("UnloadRootSigHandler")),
0776           resetErrHandler_(pset.getUntrackedParameter<bool>("ResetRootErrHandler")),
0777           loadAllDictionaries_(pset.getUntrackedParameter<bool>("LoadAllDictionaries")),
0778           autoLibraryLoader_(loadAllDictionaries_ or pset.getUntrackedParameter<bool>("AutoLibraryLoader")),
0779           autoClassParser_(pset.getUntrackedParameter<bool>("AutoClassParser")),
0780           interactiveDebug_(pset.getUntrackedParameter<bool>("InteractiveDebug")) {
0781       stackTracePause_ = pset.getUntrackedParameter<int>("StackTracePauseTime");
0782 
0783       if (not threadTracker_) {
0784         threadTracker_ = std::make_unique<ThreadTracker>();
0785         iReg.watchPostEndJob([]() {
0786           if (threadTracker_) {
0787             threadTracker_->observe(false);
0788           }
0789         });
0790       }
0791 
0792       if (unloadSigHandler_) {
0793         // Deactivate all the Root signal handlers and restore the system defaults
0794         gSystem->ResetSignal(kSigChild);
0795         gSystem->ResetSignal(kSigBus);
0796         gSystem->ResetSignal(kSigSegmentationViolation);
0797         gSystem->ResetSignal(kSigIllegalInstruction);
0798         gSystem->ResetSignal(kSigSystem);
0799         gSystem->ResetSignal(kSigPipe);
0800         gSystem->ResetSignal(kSigAlarm);
0801         gSystem->ResetSignal(kSigUrgent);
0802         gSystem->ResetSignal(kSigFloatingException);
0803         gSystem->ResetSignal(kSigWindowChanged);
0804       } else if (pset.getUntrackedParameter<bool>("AbortOnSignal")) {
0805         cachePidInfo();
0806 
0807         //NOTE: ROOT can also be told to abort on these kinds of problems BUT
0808         // it requires an TApplication to be instantiated which causes problems
0809         gSystem->ResetSignal(kSigBus);
0810         gSystem->ResetSignal(kSigSegmentationViolation);
0811         gSystem->ResetSignal(kSigIllegalInstruction);
0812         gSystem->ResetSignal(kSigFloatingException);
0813         installCustomHandler(SIGBUS, sig_dostack_then_abort);
0814         sigBusHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGBUS, sig_abort); });
0815         installCustomHandler(SIGSEGV, sig_dostack_then_abort);
0816         sigSegvHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGSEGV, sig_abort); });
0817         installCustomHandler(SIGILL, sig_dostack_then_abort);
0818         sigIllHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGILL, sig_abort); });
0819         installCustomHandler(SIGTERM, sig_dostack_then_abort);
0820         sigTermHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGTERM, sig_abort); });
0821         installCustomHandler(SIGFPE, sig_dostack_then_abort);
0822         sigFpeHandler_ = std::shared_ptr<const void>(nullptr, [](void*) { installCustomHandler(SIGFPE, sig_abort); });
0823         installCustomHandler(SIGABRT, sig_dostack_then_abort);
0824         sigAbrtHandler_ = std::shared_ptr<const void>(nullptr, [](void*) {
0825           signal(SIGABRT, SIG_DFL);  // release SIGABRT to default
0826         });
0827       }
0828 
0829       iReg.watchPreallocate([](edm::service::SystemBounds const& iBounds) {
0830         if (iBounds.maxNumberOfThreads() > moduleListBuffers_.size()) {
0831           moduleListBuffers_.resize(iBounds.maxNumberOfThreads());
0832         }
0833       });
0834 
0835       if (resetErrHandler_) {
0836         // Replace the Root error handler with one that uses the MessageLogger
0837         SetErrorHandler(RootErrorHandler);
0838       }
0839 
0840       // Enable automatic Root library loading.
0841       if (autoLibraryLoader_) {
0842         gInterpreter->SetClassAutoloading(1);
0843       }
0844 
0845       // Enable/disable automatic parsing of headers
0846       if (not autoClassParser_) {
0847         // Disable automatic parsing of headers during module construction
0848         iReg.watchPreModuleConstruction(
0849             [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(false); });
0850         iReg.watchPostModuleConstruction(
0851             [](edm::ModuleDescription const&) { gInterpreter->SetClassAutoparsing(true); });
0852       }
0853 
0854       // Set ROOT parameters.
0855       TTree::SetMaxTreeSize(kMaxLong64);
0856       TH1::AddDirectory(kFALSE);
0857       //G__SetCatchException(0);
0858 
0859       // Set custom streamers
0860       setRefCoreStreamerInTClass();
0861 
0862       // Load the library containing dictionaries for std:: classes, if not already loaded.
0863       if (!hasDictionary(typeid(std::vector<std::vector<unsigned int>>))) {
0864         TypeWithDict::byName("std::vector<std::vector<unsigned int> >");
0865       }
0866 
0867       int debugLevel = pset.getUntrackedParameter<int>("DebugLevel");
0868       if (debugLevel > 0) {
0869         gDebug = debugLevel;
0870       }
0871 
0872       // Enable Root implicit multi-threading
0873       bool imt = pset.getUntrackedParameter<bool>("EnableIMT");
0874       if (imt && not ROOT::IsImplicitMTEnabled()) {
0875         //cmsRun uses global_control to set the number of allowed threads to use
0876         // we need to tell ROOT the same value in order to avoid unnecessary warnings
0877         ROOT::EnableImplicitMT(
0878             oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism));
0879       }
0880     }
0881 
0882     InitRootHandlers::~InitRootHandlers() {
0883       // close all open ROOT files
0884       TIter iter(gROOT->GetListOfFiles());
0885       TObject* obj = nullptr;
0886       while (nullptr != (obj = iter.Next())) {
0887         TFile* f = dynamic_cast<TFile*>(obj);
0888         if (f) {
0889           // We get a new iterator each time,
0890           // because closing a file can invalidate the iterator
0891           f->Close();
0892           iter = TIter(gROOT->GetListOfFiles());
0893         }
0894       }
0895       //disengage from TBB to avoid possible at exit problems
0896       threadTracker_.reset();
0897     }
0898 
0899     void InitRootHandlers::willBeUsingThreads() {
0900       //Tell Root we want to be multi-threaded
0901       ROOT::EnableThreadSafety();
0902 
0903       //When threading, also have to keep ROOT from logging all TObjects into a list
0904       TObject::SetObjectStat(false);
0905 
0906       //Have to avoid having Streamers modify themselves after they have been used
0907       TVirtualStreamerInfo::Optimize(false);
0908     }
0909 
0910     void InitRootHandlers::fillDescriptions(ConfigurationDescriptions& descriptions) {
0911       ParameterSetDescription desc;
0912       desc.setComment("Centralized interface to ROOT.");
0913       desc.addUntracked<bool>("UnloadRootSigHandler", false)
0914           ->setComment("If True, signals are handled by this service, rather than by ROOT.");
0915       desc.addUntracked<bool>("ResetRootErrHandler", true)
0916           ->setComment(
0917               "If True, ROOT messages (e.g. errors, warnings) are handled by this service, rather than by ROOT.");
0918       desc.addUntracked<bool>("AutoLibraryLoader", true)
0919           ->setComment("If True, enables automatic loading of data dictionaries.");
0920       desc.addUntracked<bool>("AutoClassParser", true)
0921           ->setComment(
0922               "If False, the automatic parsing of class headers for dictionaries when pre-built dictionaries are "
0923               "missing is disable during module construction. The current implementation of disabling the parsing is "
0924               "fragile, and may work only in a single-thread job that does not use reco::parser::cutParser() or "
0925               "reco::parser::expressionParser() (and it certainly does not work on multiple threads).");
0926       desc.addUntracked<bool>("LoadAllDictionaries", false)->setComment("If True, loads all ROOT dictionaries.");
0927       desc.addUntracked<bool>("EnableIMT", true)->setComment("If True, calls ROOT::EnableImplicitMT().");
0928       desc.addUntracked<bool>("AbortOnSignal", true)
0929           ->setComment(
0930               "If True, do an abort when a signal occurs that causes a crash. If False, ROOT will do an exit which "
0931               "attempts to do a clean shutdown.");
0932       desc.addUntracked<bool>("InteractiveDebug", false)
0933           ->setComment(
0934               "If True, leave gdb attached to cmsRun after a crash; "
0935               "if False, attach gdb, print a stack trace, and quit gdb");
0936       desc.addUntracked<int>("DebugLevel", 0)->setComment("Sets ROOT's gDebug value.");
0937       desc.addUntracked<int>("StackTracePauseTime", 300)
0938           ->setComment("Seconds to pause other threads during stack trace.");
0939       descriptions.add("InitRootHandlers", desc);
0940     }
0941 
0942     char const* const* InitRootHandlers::getPstackArgv() { return pstackArgv_; }
0943 
0944     void InitRootHandlers::enableWarnings_() { s_ignoreWarnings = edm::RootHandlers::SeverityLevel::kInfo; }
0945 
0946     void InitRootHandlers::ignoreWarnings_(edm::RootHandlers::SeverityLevel level) { s_ignoreWarnings = level; }
0947 
0948     void InitRootHandlers::cachePidInfo() {
0949       if (helperThread_) {
0950         //Another InitRootHandlers was initialized in this job, possibly
0951         // because multiple EventProcessors are being used.
0952         //In that case, we are already all setup
0953         return;
0954       }
0955       std::string gdbcmd{"date; gdb -quiet -p %d"};
0956       if (!interactiveDebug_) {
0957         gdbcmd +=
0958             " 2>&1 <<EOF |\n"
0959             "set width 0\n"
0960             "set height 0\n"
0961             "set pagination no\n"
0962             "thread apply all bt\n"
0963             "EOF\n"
0964             "/bin/sed -n -e 's/^\\((gdb) \\)*//' -e '/^#/p' -e '/^Thread/p'";
0965       }
0966       if (snprintf(pidString_, pidStringLength_ - 1, gdbcmd.c_str(), getpid()) >= pidStringLength_) {
0967         std::ostringstream sstr;
0968         sstr << "Unable to pre-allocate stacktrace handler information";
0969         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0970         throw except;
0971       }
0972 
0973       // These are initialized to -1; harmless to close an invalid FD.
0974       // If this is called post-fork, we don't want to be communicating on
0975       // these FDs as they are used internally by the parent.
0976       close(childToParent_[0]);
0977       close(childToParent_[1]);
0978       childToParent_[0] = -1;
0979       childToParent_[1] = -1;
0980       close(parentToChild_[0]);
0981       close(parentToChild_[1]);
0982       parentToChild_[0] = -1;
0983       parentToChild_[1] = -1;
0984 
0985       if (-1 == pipe2(childToParent_, O_CLOEXEC)) {
0986         std::ostringstream sstr;
0987         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0988         edm::Exception except(edm::errors::OtherCMS, sstr.str());
0989         throw except;
0990       }
0991 
0992       if (-1 == pipe2(parentToChild_, O_CLOEXEC)) {
0993         close(childToParent_[0]);
0994         close(childToParent_[1]);
0995         childToParent_[0] = -1;
0996         childToParent_[1] = -1;
0997         std::ostringstream sstr;
0998         sstr << "Failed to create child-to-parent pipes (errno=" << errno << "): " << strerror(errno);
0999         edm::Exception except(edm::errors::OtherCMS, sstr.str());
1000         throw except;
1001       }
1002 
1003       helperThread_ = std::make_unique<std::thread>(stacktraceHelperThread);
1004       helperThread_->detach();
1005     }
1006 
1007   }  // end of namespace service
1008 }  // end of namespace edm
1009 
1010 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
1011 
1012 using edm::service::InitRootHandlers;
1013 typedef edm::serviceregistry::AllArgsMaker<edm::RootHandlers, InitRootHandlers> RootHandlersMaker;
1014 DEFINE_FWK_SERVICE_MAKER(InitRootHandlers, RootHandlersMaker);