File indexing completed on 2024-04-06 12:15:43
0001 #include <iostream>
0002
0003 #include <cuda.h>
0004
0005 #include "DataFormats/Provenance/interface/ModuleDescription.h"
0006 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0007 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0008 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0009 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0010 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
0011 #include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h"
0012 #include "FWCore/ServiceRegistry/interface/Service.h"
0013 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0014 #include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
0015 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0016 #include "HeterogeneousCore/CUDAUtilities/interface/deviceAllocatorStatus.h"
0017
// Forward declaration only: in this file edm::StreamContext is just taken by
// const reference in the callback signatures and never inspected.
namespace edm {
  class StreamContext;
}  // namespace edm
0021
0022 class CUDAMonitoringService {
0023 public:
0024 CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry);
0025 ~CUDAMonitoringService() = default;
0026
0027 static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0028
0029 void postModuleConstruction(edm::ModuleDescription const& desc);
0030 void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc);
0031 void postModuleEvent(edm::StreamContext const& sc, edm::ModuleCallingContext const& mcc);
0032 void postEvent(edm::StreamContext const& sc);
0033
0034 private:
0035 int numberOfDevices_ = 0;
0036 };
0037
0038 CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
0039
0040 edm::Service<CUDAInterface> cuda;
0041 if (not cuda or not cuda->enabled())
0042 return;
0043
0044 numberOfDevices_ = cuda->numberOfDevices();
0045
0046 if (config.getUntrackedParameter<bool>("memoryConstruction")) {
0047 registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
0048 }
0049 if (config.getUntrackedParameter<bool>("memoryBeginStream")) {
0050 registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream);
0051 }
0052 if (config.getUntrackedParameter<bool>("memoryPerModule")) {
0053 registry.watchPostModuleEvent(this, &CUDAMonitoringService::postModuleEvent);
0054 }
0055 if (config.getUntrackedParameter<bool>("memoryPerEvent")) {
0056 registry.watchPostEvent(this, &CUDAMonitoringService::postEvent);
0057 }
0058 }
0059
0060 void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0061 edm::ParameterSetDescription desc;
0062
0063 desc.addUntracked<bool>("memoryConstruction", false)
0064 ->setComment("Print memory information for each device after the construction of each module");
0065 desc.addUntracked<bool>("memoryBeginStream", true)
0066 ->setComment("Print memory information for each device after the beginStream() of each module");
0067 desc.addUntracked<bool>("memoryPerModule", true)
0068 ->setComment("Print memory information for each device after the event of each module");
0069 desc.addUntracked<bool>("memoryPerEvent", true)
0070 ->setComment("Print memory information for each device after each event");
0071
0072 descriptions.add("CUDAMonitoringService", desc);
0073 descriptions.setComment(
0074 "The memory information is the global state of the device. This gets confusing if there are multiple processes "
0075 "running on the same device. Probably the information retrieval should be re-thought?");
0076 }
0077
0078
0079 namespace {
0080 template <typename T>
0081 void dumpUsedMemory(T& log, int num) {
0082 auto const cachingDeviceAllocatorStatus = cms::cuda::deviceAllocatorStatus();
0083 int old = 0;
0084 cudaCheck(cudaGetDevice(&old));
0085 constexpr auto mbytes = 1 << 20;
0086 for (int i = 0; i < num; ++i) {
0087 size_t freeMemory, totalMemory;
0088 cudaCheck(cudaSetDevice(i));
0089 cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
0090 log << "\n"
0091 << i << ": " << (totalMemory - freeMemory) / mbytes << " MB used / " << totalMemory / mbytes << " MB total";
0092 auto found = cachingDeviceAllocatorStatus.find(i);
0093 if (found != cachingDeviceAllocatorStatus.end()) {
0094 auto const& cached = found->second;
0095 log << "; CachingDeviceAllocator " << cached.live / mbytes << " MB live "
0096 << "(" << cached.liveRequested / mbytes << " MB requested) " << cached.free / mbytes << " MB free "
0097 << (cached.live + cached.free) / mbytes << " MB total cached";
0098 }
0099 }
0100 cudaCheck(cudaSetDevice(old));
0101 }
0102 }
0103
0104 void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
0105 auto log = edm::LogPrint("CUDAMonitoringService");
0106 log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
0107 dumpUsedMemory(log, numberOfDevices_);
0108 }
0109
0110 void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0111 auto log = edm::LogPrint("CUDAMonitoringService");
0112 log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " ("
0113 << mcc.moduleDescription()->moduleName() << ")";
0114 dumpUsedMemory(log, numberOfDevices_);
0115 }
0116
0117 void CUDAMonitoringService::postModuleEvent(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
0118 auto log = edm::LogPrint("CUDAMonitoringService");
0119 log << "CUDA device memory after processing an event by " << mcc.moduleDescription()->moduleLabel() << " ("
0120 << mcc.moduleDescription()->moduleName() << ")";
0121 dumpUsedMemory(log, numberOfDevices_);
0122 }
0123
0124 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
0125 auto log = edm::LogPrint("CUDAMonitoringService");
0126 log << "CUDA device memory after event";
0127 dumpUsedMemory(log, numberOfDevices_);
0128 }
0129
0130 DEFINE_FWK_SERVICE(CUDAMonitoringService);