#include <iomanip>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ResourceInformation.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}
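
// Example (hypothetical values): request a 4 kB per-thread stack on the current device;
// a warning is logged if the limit is unsupported or could not be applied as requested:
//   setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", 4 * 1024);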

constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0: GA100 class
      return 64;
    case 86:  // SM 8.6: GA10x class
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}
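
// Example: a TU102 GPU (compute capability 7.5) reports 64 cores per SM here, so a card
// with 68 streaming multiprocessors corresponds to 68 * 64 = 4352 CUDA cores;
// architectures not listed above fall back to the default of 64.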

std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}
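
// The CUDA runtime encodes versions as major * 1000 + minor * 10,
// so e.g. decodeVersion(11040) yields "11.4".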

namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]> > buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }
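
  // Note: when the buffers vector above goes out of scope, the allocations are handed
  // back to the caching allocator's pools (when caching is enabled) rather than freed,
  // so this warm-up makes later allocations of the same sizes cheap.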

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace
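
// devicePreallocate() and hostPreallocate() are called at the end of the CUDAService
// constructor, driven by the "devicePreallocate" and "hostPreallocate" parameters of the
// "allocator" PSet (see fillDescriptions() below).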

/// Constructor
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
  if (not configEnabled) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // NVIDIA system driver version, e.g. 470.57.02
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // CUDA driver version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.100, is not reported
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // CUDA runtime version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.108, is not reported
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver:    " << systemDriverVersion << '\n';
    log << "CUDA driver API:  " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

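  // Read the requested CUDA resource limits; a negative value keeps the device default
  // (see fillDescriptions() below).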
  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device.
    // see the documentation of cudaGetDeviceProperties() for more information.
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

    // compute capabilities
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << "  compute capability:          " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << "  CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << "  compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
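    // cudaDeviceScheduleAuto lets the driver choose the host-thread synchronisation policy
    // based on the number of active CUDA contexts and available processors, while
    // cudaDeviceMapHost allows pinned host allocations to be mapped into the device
    // address space.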

    // read the free and total amount of memory available for allocation by the device, in bytes.
    // see the documentation of cudaMemGetInfo() for more information.
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << "  " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << "  "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }
    // read back the CUDA device flags set above.
    // see the documentation of cudaSetDeviceFlags() and cudaGetDeviceFlags() for more information.
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << "  thread policy:                   default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << "  thread policy:                      spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << "  thread policy:                     yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << "  thread policy:             blocking sync\n";
          break;
        default:
          log << "  thread policy:                 undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << "  pinned host memory allocations:  enabled\n";
      } else {
        log << "  pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << "  kernel host memory reuse:        enabled\n";
      } else {
        log << "  kernel host memory reuse:       disabled\n";
      }
      log << '\n';
    }

    // set and read the CUDA resource limits.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
    // and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
      // a thread can safely call cudaDeviceSynchronize().
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
        log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }

  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // Preallocate buffers if asked to
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

CUDAService::~CUDAService() {
  if (enabled_) {
    // Explicitly destruct the allocator before the device resets below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // Explicitly destroys and cleans up all resources associated with the current device in the
      // current process. Any subsequent API call to this device will reinitialize the device.
      // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
      cudaDeviceReset();
    }
  }
}

void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}
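
// For reference, a hypothetical python configuration fragment matching this description
// (parameter names follow fillDescriptions() above; the values shown are the defaults):
//
//   process.CUDAService = cms.Service("CUDAService",
//       enabled = cms.untracked.bool(True),
//       verbose = cms.untracked.bool(False),
//       limits = cms.untracked.PSet(
//           cudaLimitPrintfFifoSize = cms.untracked.int32(-1),
//           cudaLimitStackSize = cms.untracked.int32(-1),
//           cudaLimitMallocHeapSize = cms.untracked.int32(-1),
//           cudaLimitDevRuntimeSyncDepth = cms.untracked.int32(-1),
//           cudaLimitDevRuntimePendingLaunchCount = cms.untracked.int32(-1)
//       ),
//       allocator = cms.untracked.PSet(
//           devicePreallocate = cms.untracked.vuint32(),
//           hostPreallocate = cms.untracked.vuint32()
//       )
//   )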

int CUDAService::deviceWithMostFreeMemory() const {
  // save the current device
  int currentDevice;
  cudaCheck(cudaGetDevice(&currentDevice));

  size_t maxFreeMemory = 0;
  int device = -1;
  for (int i = 0; i < numberOfDevices_; ++i) {
    size_t freeMemory, totalMemory;
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                 << totalMemory / (1 << 20) << " MB total memory";
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  // restore the current device
  cudaCheck(cudaSetDevice(currentDevice));
  return device;
}
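
// Example (hypothetical usage from a framework module): pick the device with the most
// free memory before a large allocation; -1 is returned when no device qualifies.
//   edm::Service<CUDAService> cudaService;
//   int device = cudaService->deviceWithMostFreeMemory();
//   if (device >= 0) {
//     cudaCheck(cudaSetDevice(device));
//   }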