#include <iomanip>
#include <iostream>
#include <limits>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

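// Set a CUDA device limit on the current device, warning via the MessageLogger if the
// limit is not supported or could not be set to the requested value.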
void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}

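// Number of CUDA cores per streaming multiprocessor for a given compute capability;
// unknown architectures fall back to a default of 64 cores per SM.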
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0: GF100 class
      return 32;
    case 21:  // SM 2.1: GF10x class
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0: GK10x class
    case 32:  // SM 3.2: GK10x class
    case 35:  // SM 3.5: GK11x class
    case 37:  // SM 3.7: GK21x class
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0: GM10x class
    case 52:  // SM 5.2: GM20x class
    case 53:  // SM 5.3: GM20x class
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0: GP100 class
      return 64;
    case 61:  // SM 6.1: GP10x class
    case 62:  // SM 6.2: GP10x class
      return 128;

    // Volta architecture
    case 70:  // SM 7.0: GV100 class
    case 72:  // SM 7.2: GV11b class
      return 64;

    // Turing architecture
    case 75:  // SM 7.5: TU10x class
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0: GA100 class
      return 64;
    case 86:  // SM 8.6: GA10x class
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}

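// Convert the integer version reported by the CUDA API into a "major.minor" string,
// e.g. decodeVersion(11040) yields "11.4".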
std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}

namespace {
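  // Allocate buffers of the given sizes using a stream from the stream cache; when the buffers
  // go out of scope they are returned to the caching allocator (if enabled), so the corresponding
  // memory blocks are already cached before the first event is processed.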
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]> > buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

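  // Preallocate device memory buffers on every CUDA device, restoring the current device afterwards.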
  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

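  // Preallocate pinned host memory buffers.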
  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace

/// Constructor
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
  if (not configEnabled) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // NVIDIA system driver version, e.g. 470.57.02
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // CUDA driver version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.100, is not reported
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // CUDA runtime version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.108, is not reported
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver:    " << systemDriverVersion << '\n';
    log << "CUDA driver API:  " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

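  // CUDA resource limits requested in the configuration; a negative value leaves the CUDA default unchanged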
  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device.
    // see the documentation of cudaGetDeviceProperties() for more information.
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }

    // compute capabilities
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << "  compute capability:          " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << "  CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << "  compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // read the free and total amount of memory available for allocation by the device, in bytes.
    // see the documentation of cudaMemGetInfo() for more information.
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << "  " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << "  "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

    // set and read the CUDA device flags.
    // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << "  thread policy:                   default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << "  thread policy:                      spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << "  thread policy:                     yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << "  thread policy:             blocking sync\n";
          break;
        default:
          log << "  thread policy:                 undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << "  pinned host memory allocations:  enabled\n";
      } else {
        log << "  pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << "  kernel local memory reuse:       enabled\n";
      } else {
        log << "  kernel local memory reuse:      disabled\n";
      }
      log << '\n';
    }

    // set and read the CUDA resource limits.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
    // and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
      // a thread can safely call cudaDeviceSynchronize().
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
        log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // Preallocate buffers if asked to
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

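/// Destructor: if the service was enabled, free the cached allocations, then synchronize and reset each device.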
CUDAService::~CUDAService() {
  if (enabled_) {
    // Explicitly destruct the allocator before the device resets below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // Explicitly destroys and cleans up all resources associated with the current device in the
      // current process. Any subsequent API call to this device will reinitialize the device.
      // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
      cudaDeviceReset();
    }
  }
}

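/// Declare the configuration parameters accepted by the service and their default values.
/// A minimal configuration sketch, assuming the standard FWCore.ParameterSet.Config syntax
/// (any limit left at -1 keeps the CUDA default):
///   process.CUDAService = cms.Service("CUDAService",
///       verbose = cms.untracked.bool(True),
///       limits = cms.untracked.PSet(cudaLimitStackSize = cms.untracked.int32(4096))
///   )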
void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}

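/// Return the index of the CUDA device that currently reports the most free memory, or -1 if no
/// device is available; the previously active device is restored before returning.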
int CUDAService::deviceWithMostFreeMemory() const {
  // save the current device
  int currentDevice;
  cudaCheck(cudaGetDevice(&currentDevice));

  size_t maxFreeMemory = 0;
  int device = -1;
  for (int i = 0; i < numberOfDevices_; ++i) {
    size_t freeMemory, totalMemory;
    cudaSetDevice(i);
    cudaMemGetInfo(&freeMemory, &totalMemory);
    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                 << totalMemory / (1 << 20) << " MB total memory";
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  // restore the current device
  cudaCheck(cudaSetDevice(currentDevice));
  return device;
}