Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:15:46

0001 #include <iostream>
0002 
0003 #include <hip/hip_runtime.h>
0004 
0005 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0006 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0007 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0008 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0010 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0011 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0012 #include "FWCore/ServiceRegistry/interface/Service.h"
0013 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0014 #include "HeterogeneousCore/ROCmServices/interface/ROCmInterface.h"
0015 #include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
0016 
0017 namespace edm {
0018   class StreamContext;
0019 }
0020 
0021 class ROCmMonitoringService {
0022 public:
0023   ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
0024   ~ROCmMonitoringService() = default;
0025 
0026   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0027 
0028   void postModuleConstruction(edm::ModuleDescription const& desc);
0029   void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
0030   void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
0031   void postEvent(edm::StreamContext const& sc);
0032 
0033 private:
0034   int numberOfDevices_ = 0;
0035 };
0036 
0037 ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
0038   // make sure that ROCm is initialised, and that the ROCmService destructor is called after this service's destructor
0039   edm::Service<ROCmInterface> service;
0040   if (not service or not service->enabled())
0041     return;
0042 
0043   numberOfDevices_ = service->numberOfDevices();
0044 
0045   if (config.getUntrackedParameter<bool>("memoryConstruction")) {
0046     registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction);
0047   }
0048   if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
0049     registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream);
0050   }
0051   if (config.getUntrackedParameter<bool>("memoryPerModule")) {
0052     registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent);
0053   }
0054   if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
0055     registry.watchPostEvent(this, &ROCmMonitoringService::postEvent);
0056   }
0057 }
0058 
0059 void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0060   edm::ParameterSetDescription desc;
0061 
0062   desc.addUntracked<bool>("memoryConstruction", false)
0063       ->setComment("Print memory information for each device after the construction of each module");
0064   desc.addUntracked<bool>("memoryBeginStream", true)
0065       ->setComment("Print memory information for each device after the beginStream() of each module");
0066   desc.addUntracked<bool>("memoryPerModule", true)
0067       ->setComment("Print memory information for each device after the event of each module");
0068   desc.addUntracked<bool>("memoryPerEvent", true)
0069       ->setComment("Print memory information for each device after each event");
0070 
0071   descriptions.add("ROCmMonitoringService", desc);
0072   descriptions.setComment(
0073       "The memory information is the global state of the device. This gets confusing if there are multiple processes "
0074       "running on the same device. Probably the information retrieval should be re-thought?");
0075 }
0076 
0077 // activity handlers
0078 namespace {
0079   template <typename T>
0080   void dumpUsedMemory(T& log, int num) {
0081     int old = 0;
0082     hipCheck(hipGetDevice(&old));
0083     constexpr auto mbytes = 1 << 20;
0084     for (int i = 0; i < num; ++i) {
0085       size_t freeMemory, totalMemory;
0086       hipCheck(hipSetDevice(i));
0087       hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
0088       log << "\n"
0089           << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
0090     }
0091     hipCheck(hipSetDevice(old));
0092   }
0093 }  // namespace
0094 
0095 void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
0096   auto log = edm::LogPrint("ROCmMonitoringService");
0097   log << "ROCm device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
0098   dumpUsedMemory(log, numberOfDevices_);
0099 }
0100 
0101 void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0102   auto log = edm::LogPrint("ROCmMonitoringService");
0103   log << "ROCm device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
0104       << mcc.moduleDescription()->moduleName() << ")";
0105   dumpUsedMemory(log, numberOfDevices_);
0106 }
0107 
0108 void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0109   auto log = edm::LogPrint("ROCmMonitoringService");
0110   log << "ROCm device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
0111       << mcc.moduleDescription()->moduleName() << ")";
0112   dumpUsedMemory(log, numberOfDevices_);
0113 }
0114 
0115 void ROCmMonitoringService::postEvent(edm::StreamContext const& sc) {
0116   auto log = edm::LogPrint("ROCmMonitoringService");
0117   log << "ROCm device memory after event";
0118   dumpUsedMemory(log, numberOfDevices_);
0119 }
0120 
0121 DEFINE_FWK_SERVICE(ROCmMonitoringService);