#include <iomanip>
#include <iostream>
#include <limits>
#include <set>
#include <string>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ResourceInformation.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));
  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }
  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    // do not print `value`, which is left uninitialised if the call failed
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}

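// Return the number of CUDA cores per streaming multiprocessor for a given compute
// capability; used below only to log an estimated total core count per device.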
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0
      return 32;
    case 21:  // SM 2.1
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0
    case 32:  // SM 3.2
    case 35:  // SM 3.5
    case 37:  // SM 3.7
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0
    case 52:  // SM 5.2
    case 53:  // SM 5.3
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0
      return 64;
    case 61:  // SM 6.1
    case 62:  // SM 6.2
      return 128;

    // Volta architecture
    case 70:  // SM 7.0
    case 72:  // SM 7.2
      return 64;

    // Turing architecture
    case 75:  // SM 7.5
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0
      return 64;
    case 86:  // SM 8.6
      return 128;

    // unknown architecture, return a default value
    default:
      return 64;
  }
}

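// Decode a CUDA version integer, encoded as major * 1000 + minor * 10, into a
// human-readable string, e.g. 11040 -> "11.4".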
std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}

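// Helpers to pre-allocate and immediately release device and pinned host buffers;
// with the caching allocators enabled this fills their memory pools up front, so
// that the first events do not pay the allocation cost.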
namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]> > buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    // preallocate on each device in turn, then restore the previously active device
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace

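/// Construct the service: initialise the CUDA runtime, log the properties of all
/// visible devices, apply the configured limits, and set up the caching allocators
/// and the stream and event caches.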
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  bool configEnabled = config.getUntrackedParameter<bool>("enabled");
  if (not configEnabled) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

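  // read the NVIDIA system driver version via NVML, without attaching to any GPU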
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

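  // highest CUDA version supported by the installed driver, e.g. 11.4;
  // the full version (like 11.4.1 or 11.4.100) is not reported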
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

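  // version of the CUDA runtime in use, e.g. 11.4;
  // the full version (like 11.4.1 or 11.4.108) is not reported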
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver: " << systemDriverVersion << '\n';
    log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
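    // read information about the compute device;
    // see the documentation of cudaGetDeviceProperties() for more information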
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

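    // compute capability, e.g. 7.5 for a Turing-class device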
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << " compute capability: " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << " CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

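    // compute mode (shared, exclusive, or prohibited)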
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << " compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

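    // make this the current device and enable mapped pinned host memory allocations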
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

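    // read the free and total amount of memory available for allocation by the device;
    // see the documentation of cudaMemGetInfo() for more information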
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

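    // L1 cache behaviour and other capabilities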
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << " " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << " " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << " "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

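    // read back the CUDA device flags set above;
    // see the documentation of cudaGetDeviceFlags() for more information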
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << " thread policy: default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << " thread policy: spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << " thread policy: yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << " thread policy: blocking sync\n";
          break;
        default:
          log << " thread policy: undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << " pinned host memory allocations: enabled\n";
      } else {
        log << " pinned host memory allocations: disabled\n";
      }
      // cudaDeviceLmemResizeToMax keeps the local memory allocation after each kernel launch
      if (flags & cudaDeviceLmemResizeToMax) {
        log << " kernel local memory reuse: enabled\n";
      } else {
        log << " kernel local memory reuse: disabled\n";
      }
      log << '\n';
    }

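    // set and read the CUDA resource limits;
    // see the documentation of cudaDeviceSetLimit() for more information

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call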
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the
    // malloc() and free() device system calls
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    // the device runtime limits are relevant only on sm_35 and later devices
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at
      // which a thread can safely call cudaDeviceSynchronize()
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
        log << " runtime sync depth: " << std::setw(10) << value << '\n';
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << " runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }

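  // make sure the caching allocators and the stream and event caches are constructed
  // before declaring the service fully initialised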
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // preallocate buffers if the configuration asks for it
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

CUDAService::~CUDAService() {
  if (enabled_) {
    // free the memory held by the caching allocators
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // cudaDeviceReset() explicitly destroys and cleans up all resources associated
      // with the current device in the current process; any subsequent API call will
      // reinitialise the device. Useful when checking for memory leaks with tools
      // like cuda-memcheck.
      cudaDeviceReset();
    }
  }
}

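// Describe the parameters accepted by the service and their defaults. The generated
// cfi corresponds roughly to the following python fragment (a sketch for illustration
// only; the authoritative source is this fillDescriptions method):
//
//   process.CUDAService = cms.Service("CUDAService",
//       enabled = cms.untracked.bool(True),
//       verbose = cms.untracked.bool(False),
//       limits = cms.untracked.PSet(
//           cudaLimitPrintfFifoSize = cms.untracked.int32(-1),
//           cudaLimitStackSize = cms.untracked.int32(-1),
//           cudaLimitMallocHeapSize = cms.untracked.int32(-1),
//           cudaLimitDevRuntimeSyncDepth = cms.untracked.int32(-1),
//           cudaLimitDevRuntimePendingLaunchCount = cms.untracked.int32(-1)
//       ),
//       allocator = cms.untracked.PSet(
//           devicePreallocate = cms.untracked.vuint32(),
//           hostPreallocate = cms.untracked.vuint32()
//       )
//   )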
void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int> >("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int> >("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}

int CUDAService::deviceWithMostFreeMemory() const {
  // save the current device
  int currentDevice;
  cudaCheck(cudaGetDevice(&currentDevice));

  size_t maxFreeMemory = 0;
  int device = -1;
  for (int i = 0; i < numberOfDevices_; ++i) {
    size_t freeMemory, totalMemory;
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / "
                                 << totalMemory / (1 << 20) << " MB total memory";
    if (freeMemory > maxFreeMemory) {
      maxFreeMemory = freeMemory;
      device = i;
    }
  }
  // restore the previously active device
  cudaCheck(cudaSetDevice(currentDevice));
  return device;
}