File indexing completed on 2024-04-06 12:15:46
0001 #include <iostream>
0002
0003 #include <hip/hip_runtime.h>
0004
0005 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0006 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0007 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0008 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0010 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0011 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0012 #include "FWCore/ServiceRegistry/interface/Service.h"
0013 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0014 #include "HeterogeneousCore/ROCmServices/interface/ROCmInterface.h"
0015 #include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
0016
// Only references to edm::StreamContext appear in this file, so a forward
// declaration is enough here.
namespace edm { class StreamContext; }  // namespace edm
0020
0021 class ROCmMonitoringService {
0022 public:
0023 ROCmMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
0024 ~ROCmMonitoringService() = default;
0025
0026 static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0027
0028 void postModuleConstruction(edm::ModuleDescription const& desc);
0029 void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
0030 void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
0031 void postEvent(edm::StreamContext const& sc);
0032
0033 private:
0034 int numberOfDevices_ = 0;
0035 };
0036
0037 ROCmMonitoringService::ROCmMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
0038
0039 edm::Service<ROCmInterface> service;
0040 if (not service or not service->enabled())
0041 return;
0042
0043 numberOfDevices_ = service->numberOfDevices();
0044
0045 if (config.getUntrackedParameter<bool>("memoryConstruction")) {
0046 registry.watchPostModuleConstruction(this, &ROCmMonitoringService::postModuleConstruction);
0047 }
0048 if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
0049 registry.watchPostModuleBeginStream(this, &ROCmMonitoringService::postModuleBeginStream);
0050 }
0051 if (config.getUntrackedParameter<bool>("memoryPerModule")) {
0052 registry.watchPostModuleEvent(this, &ROCmMonitoringService::postModuleEvent);
0053 }
0054 if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
0055 registry.watchPostEvent(this, &ROCmMonitoringService::postEvent);
0056 }
0057 }
0058
0059 void ROCmMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0060 edm::ParameterSetDescription desc;
0061
0062 desc.addUntracked<bool>("memoryConstruction", false)
0063 ->setComment("Print memory information for each device after the construction of each module");
0064 desc.addUntracked<bool>("memoryBeginStream", true)
0065 ->setComment("Print memory information for each device after the beginStream() of each module");
0066 desc.addUntracked<bool>("memoryPerModule", true)
0067 ->setComment("Print memory information for each device after the event of each module");
0068 desc.addUntracked<bool>("memoryPerEvent", true)
0069 ->setComment("Print memory information for each device after each event");
0070
0071 descriptions.add("ROCmMonitoringService", desc);
0072 descriptions.setComment(
0073 "The memory information is the global state of the device. This gets confusing if there are multiple processes "
0074 "running on the same device. Probably the information retrieval should be re-thought?");
0075 }
0076
0077
0078 namespace {
0079 template <typename T>
0080 void dumpUsedMemory(T& log, int num) {
0081 int old = 0;
0082 hipCheck(hipGetDevice(&old));
0083 constexpr auto mbytes = 1 << 20;
0084 for (int i = 0; i < num; ++i) {
0085 size_t freeMemory, totalMemory;
0086 hipCheck(hipSetDevice(i));
0087 hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
0088 log << "\n"
0089 << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
0090 }
0091 hipCheck(hipSetDevice(old));
0092 }
0093 }
0094
0095 void ROCmMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
0096 auto log = edm::LogPrint("ROCmMonitoringService");
0097 log << "ROCm device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
0098 dumpUsedMemory(log, numberOfDevices_);
0099 }
0100
0101 void ROCmMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0102 auto log = edm::LogPrint("ROCmMonitoringService");
0103 log << "ROCm device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
0104 << mcc.moduleDescription()->moduleName() << ")";
0105 dumpUsedMemory(log, numberOfDevices_);
0106 }
0107
0108 void ROCmMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0109 auto log = edm::LogPrint("ROCmMonitoringService");
0110 log << "ROCm device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
0111 << mcc.moduleDescription()->moduleName() << ")";
0112 dumpUsedMemory(log, numberOfDevices_);
0113 }
0114
0115 void ROCmMonitoringService::postEvent(edm::StreamContext const& sc) {
0116 auto log = edm::LogPrint("ROCmMonitoringService");
0117 log << "ROCm device memory after event";
0118 dumpUsedMemory(log, numberOfDevices_);
0119 }
0120
// NOTE(review): presumably makes ROCmMonitoringService loadable as a framework
// service; the macro is expected to come from ServiceMaker.h — confirm.
0121 DEFINE_FWK_SERVICE(ROCmMonitoringService);