#include <iomanip>
#include <iostream>
#include <limits>
#include <set>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/AbstractServices/interface/ResourceInformation.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

class CUDAService : public CUDAInterface {
public:
  CUDAService(edm::ParameterSet const& config);
  ~CUDAService() override;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

  bool enabled() const final { return enabled_; }

  int numberOfDevices() const final { return numberOfDevices_; }

  // Return the (major, minor) CUDA compute capability of the given device.
  std::pair<int, int> computeCapability(int device) const final {
    int size = computeCapabilities_.size();
    if (device < 0 or device >= size) {
      throw std::out_of_range("Invalid device index " + std::to_string(device) + ": the valid range is from 0 to " +
                              std::to_string(size - 1));
    }
    return computeCapabilities_[device];
  }
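
  // Example (illustrative only): client code normally reaches this service
  // through the CUDAInterface base class, registered at the bottom of this
  // file; the device index and capability values below are hypothetical.
  //
  //   edm::Service<CUDAInterface> cuda;
  //   if (cuda and cuda->enabled()) {
  //     auto [major, minor] = cuda->computeCapability(0);  // e.g. {7, 5} on a Turing card
  //   }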

private:
  int numberOfDevices_ = 0;
  std::vector<std::pair<int, int>> computeCapabilities_;
  bool enabled_ = false;
  bool verbose_ = false;
};

void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    // if reading back failed, `value` was never filled in, so do not print it
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}
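
// Example (hypothetical values, for illustration): request a 4 MB device-side
// printf FIFO on the currently selected device; a warning is logged if the
// limit is unsupported or could not be applied:
//
//   cudaCheck(cudaSetDevice(0));
//   setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", 4 * 1024 * 1024);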

constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0: GA100 class
      return 64;
    case 86:  // SM 8.6: GA10x class
      return 128;

    // Ada Lovelace architecture
    case 89:  // SM 8.9: AD10x class
      return 128;

    // Hopper architecture
    case 90:  // SM 9.0: GH100 class
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}
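
// Example (illustrative, the device figures are hypothetical): the total
// number of CUDA cores, as printed in the verbose report below, is the per-SM
// count times the multiprocessor count, e.g. for an SM 8.6 (GA10x) part with
// 28 streaming multiprocessors:
//
//   getCudaCoresPerSM(8, 6) * 28  ==  128 * 28  ==  3584 cores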

std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}
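
// Example: the CUDA runtime encodes version 11.4 as the integer 11040, so
// decodeVersion(11040) returns "11.4"; the patch level (e.g. 11.4.1) is not
// representable in this encoding.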

namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]>> buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace
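
// The buffers above go out of scope at the end of preallocate(); with the
// caching allocator enabled, freeing them returns the blocks to the cache
// rather than to the driver, so later allocations of the same sizes avoid
// cudaMalloc()/cudaHostAlloc() calls. A sketch of a warm-up call with
// hypothetical sizes (1 MB and 16 MB per device, 1 MB pinned on the host):
//
//   devicePreallocate(numberOfDevices, {1048576, 16777216});
//   hostPreallocate({1048576});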

/// Constructor
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  if (not config.getUntrackedParameter<bool>("enabled")) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // NVIDIA system driver version, e.g. 470.57.02
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // CUDA driver version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.100, is not reported
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // CUDA runtime version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.108, is not reported
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver:    " << systemDriverVersion << '\n';
    log << "CUDA driver API:  " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device.
    // see the documentation of cudaGetDeviceProperties() for more information.
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

    // compute capabilities
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << "  compute capability:          " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << "  CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << "  compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // read the free and total amount of memory available for allocation by the device, in bytes.
    // see the documentation of cudaMemGetInfo() for more information.
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << "  " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << "  "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

    // set and read the CUDA device flags.
    // see the documentation of cudaSetDeviceFlags() and cudaGetDeviceFlags() for more information.
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << "  thread policy:                   default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << "  thread policy:                      spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << "  thread policy:                     yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << "  thread policy:             blocking sync\n";
          break;
        default:
          log << "  thread policy:                 undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << "  pinned host memory allocations:  enabled\n";
      } else {
        log << "  pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << "  kernel host memory reuse:        enabled\n";
      } else {
        log << "  kernel host memory reuse:       disabled\n";
      }
      log << '\n';
    }

    // set and read the CUDA resource limits.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
    // and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }

  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // Preallocate buffers if asked to
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
}

CUDAService::~CUDAService() {
  if (enabled_) {
    // Explicitly destruct the allocator before the device resets below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // Explicitly destroys and cleans up all resources associated with the current device in the
      // current process. Any subsequent API call to this device will reinitialize the device.
      // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
      cudaDeviceReset();
    }
  }
}

void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}
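
// A sketch of how these parameters might be overridden from a cmsRun job
// configuration, using the "CUDAService" label registered above (python,
// values purely illustrative):
//
//   process.CUDAService.verbose = True
//   process.CUDAService.limits.cudaLimitStackSize = 4096
//   process.CUDAService.allocator.devicePreallocate = [1048576]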

namespace edm {
  namespace service {
    inline bool isProcessWideService(CUDAService const*) { return true; }
  }  // namespace service
}  // namespace edm

#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
using CUDAServiceMaker = edm::serviceregistry::ParameterSetMaker<CUDAInterface, CUDAService>;
DEFINE_FWK_SERVICE_MAKER(CUDAService, CUDAServiceMaker);