Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2021-02-14 12:49:25

0001 #include <iostream>
0002 
0003 #include <cuda.h>
0004 
0005 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0006 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0007 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0008 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0010 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0011 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0012 #include "FWCore/ServiceRegistry/interface/Service.h"
0013 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0014 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
0017 
0018 namespace edm {
0019   class StreamContext;
0020 }
0021 
0022 class CUDAMonitoringService {
0023 public:
0024   CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
0025   ~CUDAMonitoringService() = default;
0026 
0027   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0028 
0029   void postModuleConstruction(edm::ModuleDescription const& desc);
0030   void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
0031   void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
0032   void postEvent(edm::StreamContext const& sc);
0033 
0034 private:
0035   int numberOfDevices_ = 0;
0036 };
0037 
0038 CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
0039   // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
0040   edm::Service<CUDAService> cudaService;
0041   if (!cudaService->enabled())
0042     return;
0043   numberOfDevices_ = cudaService->numberOfDevices();
0044 
0045   if (config.getUntrackedParameter<bool>("memoryConstruction")) {
0046     registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
0047   }
0048   if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
0049     registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
0050   }
0051   if (config.getUntrackedParameter<bool>("memoryPerModule")) {
0052     registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
0053   }
0054   if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
0055     registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
0056   }
0057 }
0058 
0059 void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0060   edm::ParameterSetDescription desc;
0061 
0062   desc.addUntracked<bool>("memoryConstruction", false)
0063       ->setComment("Print memory information for each device after the construction of each module");
0064   desc.addUntracked<bool>("memoryBeginStream", true)
0065       ->setComment("Print memory information for each device after the beginStream() of each module");
0066   desc.addUntracked<bool>("memoryPerModule", true)
0067       ->setComment("Print memory information for each device after the event of each module");
0068   desc.addUntracked<bool>("memoryPerEvent", true)
0069       ->setComment("Print memory information for each device after each event");
0070 
0071   descriptions.add("CUDAMonitoringService", desc);
0072   descriptions.setComment(
0073       "The memory information is the global state of the device. This gets confusing if there are multiple processes "
0074       "running on the same device. Probably the information retrieval should be re-thought?");
0075 }
0076 
0077 // activity handlers
0078 namespace {
0079   template <typename T>
0080   void dumpUsedMemory(T& log, int num) {
0081     auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
0082     int old = 0;
0083     cudaCheck(cudaGetDevice(&old));
0084     constexpr auto mbytes = 1 << 20;
0085     for (int i = 0; i < num; ++i) {
0086       size_t freeMemory, totalMemory;
0087       cudaCheck(cudaSetDevice(i));
0088       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
0089       log << "\n"
0090           << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
0091       auto found = cachingDeviceAllocatorStatus.find(i);
0092       if (found != cachingDeviceAllocatorStatus.end()) {
0093         auto const& cached = found->second;
0094         log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
0095             << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
0096             << (cached.live + cached.free) / mbytes << " MB total cached";
0097       }
0098     }
0099     cudaCheck(cudaSetDevice(old));
0100   }
0101 }  // namespace
0102 
0103 void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
0104   auto log = edm::LogPrint("CUDAMonitoringService");
0105   log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
0106   dumpUsedMemory(log, numberOfDevices_);
0107 }
0108 
0109 void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0110   auto log = edm::LogPrint("CUDAMonitoringService");
0111   log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
0112       << mcc.moduleDescription()->moduleName() << ")";
0113   dumpUsedMemory(log, numberOfDevices_);
0114 }
0115 
0116 void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0117   auto log = edm::LogPrint("CUDAMonitoringService");
0118   log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
0119       << mcc.moduleDescription()->moduleName() << ")";
0120   dumpUsedMemory(log, numberOfDevices_);
0121 }
0122 
0123 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
0124   auto log = edm::LogPrint("CUDAMonitoringService");
0125   log << "CUDA device memory after event";
0126   dumpUsedMemory(log, numberOfDevices_);
0127 }
0128 
// Registers CUDAMonitoringService as a framework service
// (NOTE(review): macro presumably provided by FWCore/ServiceRegistry/interface/ServiceMaker.h — confirm).
DEFINE_FWK_SERVICE(CUDAMonitoringService);