Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:15:43

0001 #include <iostream>
0002 
0003 #include <cuda.h>
0004 
0005 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0006 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0007 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0008 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0010 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0011 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0012 #include "FWCore/ServiceRegistry/interface/Service.h"
0013 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0014 #include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
0017 
// Forward declaration: StreamContext is only used by reference in the handler signatures below.
namespace edm { class StreamContext; }
0021 
0022 class CUDAMonitoringService {
0023 public:
0024   CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
0025   ~CUDAMonitoringService() = default;
0026 
0027   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0028 
0029   void postModuleConstruction(edm::ModuleDescription const& desc);
0030   void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
0031   void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
0032   void postEvent(edm::StreamContext const& sc);
0033 
0034 private:
0035   int numberOfDevices_ = 0;
0036 };
0037 
0038 CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
0039   // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
0040   edm::Service<CUDAInterface> cuda;
0041   if (not cuda or not cuda->enabled())
0042     return;
0043 
0044   numberOfDevices_ = cuda->numberOfDevices();
0045 
0046   if (config.getUntrackedParameter<bool>("memoryConstruction")) {
0047     registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
0048   }
0049   if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
0050     registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
0051   }
0052   if (config.getUntrackedParameter<bool>("memoryPerModule")) {
0053     registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
0054   }
0055   if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
0056     registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
0057   }
0058 }
0059 
0060 void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0061   edm::ParameterSetDescription desc;
0062 
0063   desc.addUntracked<bool>("memoryConstruction", false)
0064       ->setComment("Print memory information for each device after the construction of each module");
0065   desc.addUntracked<bool>("memoryBeginStream", true)
0066       ->setComment("Print memory information for each device after the beginStream() of each module");
0067   desc.addUntracked<bool>("memoryPerModule", true)
0068       ->setComment("Print memory information for each device after the event of each module");
0069   desc.addUntracked<bool>("memoryPerEvent", true)
0070       ->setComment("Print memory information for each device after each event");
0071 
0072   descriptions.add("CUDAMonitoringService", desc);
0073   descriptions.setComment(
0074       "The memory information is the global state of the device. This gets confusing if there are multiple processes "
0075       "running on the same device. Probably the information retrieval should be re-thought?");
0076 }
0077 
0078 // activity handlers
0079 namespace {
0080   template <typename T>
0081   void dumpUsedMemory(T& log, int num) {
0082     auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
0083     int old = 0;
0084     cudaCheck(cudaGetDevice(&old));
0085     constexpr auto mbytes = 1 << 20;
0086     for (int i = 0; i < num; ++i) {
0087       size_t freeMemory, totalMemory;
0088       cudaCheck(cudaSetDevice(i));
0089       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
0090       log << "\n"
0091           << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
0092       auto found = cachingDeviceAllocatorStatus.find(i);
0093       if (found != cachingDeviceAllocatorStatus.end()) {
0094         auto const& cached = found->second;
0095         log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
0096             << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
0097             << (cached.live + cached.free) / mbytes << " MB total cached";
0098       }
0099     }
0100     cudaCheck(cudaSetDevice(old));
0101   }
0102 }  // namespace
0103 
0104 void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
0105   auto log = edm::LogPrint("CUDAMonitoringService");
0106   log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
0107   dumpUsedMemory(log, numberOfDevices_);
0108 }
0109 
0110 void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0111   auto log = edm::LogPrint("CUDAMonitoringService");
0112   log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
0113       << mcc.moduleDescription()->moduleName() << ")";
0114   dumpUsedMemory(log, numberOfDevices_);
0115 }
0116 
0117 void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0118   auto log = edm::LogPrint("CUDAMonitoringService");
0119   log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
0120       << mcc.moduleDescription()->moduleName() << ")";
0121   dumpUsedMemory(log, numberOfDevices_);
0122 }
0123 
0124 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
0125   auto log = edm::LogPrint("CUDAMonitoringService");
0126   log << "CUDA device memory after event";
0127   dumpUsedMemory(log, numberOfDevices_);
0128 }
0129 
0130 DEFINE_FWK_SERVICE(CUDAMonitoringService);