#include <iomanip>
#include <iostream>
#include <limits>
#include <set>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ResourceInformation.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

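// CUDAService implements the CUDAInterface used by the rest of the framework to find
// out whether CUDA devices are available, how many there are, and what their compute
// capabilities are. At construction it also applies the configured CUDA device limits
// and can preallocate device and host buffers.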
class CUDAService : public CUDAInterface {
public:
  CUDAService(edm::ParameterSet const& config);
  ~CUDAService() override;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

  bool enabled() const final { return enabled_; }

  int numberOfDevices() const final { return numberOfDevices_; }

  // returns the compute capability of the given device, e.g. <7, 5> for an sm_75 device
  std::pair<int, int> computeCapability(int device) const final {
    int size = computeCapabilities_.size();
    if (device < 0 or device >= size) {
      throw std::out_of_range("Invalid device index " + std::to_string(device) + ": the valid range is from 0 to " +
                              std::to_string(size - 1));
    }
    return computeCapabilities_[device];
  }

private:
  int numberOfDevices_ = 0;
  std::vector<std::pair<int, int>> computeCapabilities_;
  bool enabled_ = false;
  bool verbose_ = false;
};

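// Set a CUDA device limit and warn if the limit is unsupported on this device or
// could not be set to the requested value.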
void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));

  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }

  // read back the limit value and check that it matches the request
  size_t value = 0;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}

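// Return the number of CUDA cores per streaming multiprocessor for a given compute
// capability (encoded as major * 10 + minor); unknown architectures fall back to 64.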
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:  // SM 2.0
      return 32;
    case 21:  // SM 2.1
      return 48;

    // Kepler architecture
    case 30:  // SM 3.0
    case 32:  // SM 3.2
    case 35:  // SM 3.5
    case 37:  // SM 3.7
      return 192;

    // Maxwell architecture
    case 50:  // SM 5.0
    case 52:  // SM 5.2
    case 53:  // SM 5.3
      return 128;

    // Pascal architecture
    case 60:  // SM 6.0
      return 64;
    case 61:  // SM 6.1
    case 62:  // SM 6.2
      return 128;

    // Volta architecture
    case 70:  // SM 7.0
    case 72:  // SM 7.2
      return 64;

    // Turing architecture
    case 75:  // SM 7.5
      return 64;

    // Ampere architecture
    case 80:  // SM 8.0
      return 64;
    case 86:  // SM 8.6
      return 128;

    // Ada Lovelace architecture
    case 89:  // SM 8.9
      return 128;

    // Hopper architecture
    case 90:  // SM 9.0
      return 128;

    // unknown architecture, return a conservative default
    default:
      return 64;
  }
}

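// Decode a CUDA version number as reported by the driver or runtime API: the value is
// encoded as 1000 * major + 10 * minor, so for example 11040 decodes to "11.4".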
std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}

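// Helpers used to warm up the allocators: allocate and immediately free buffers of the
// configured sizes, so that with the caching allocators enabled the freed blocks remain
// cached for later reuse.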
namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]>> buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace

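// Build the service: query the CUDA runtime and driver, report the device properties,
// apply the configured device limits, and set up the stream and event caches and the
// caching allocators.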
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  if (not config.getUntrackedParameter<bool>("enabled")) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // query the NVIDIA system driver version via NVML, without attaching to any GPU
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // CUDA driver API version supported by the installed driver
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // CUDA runtime API version
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver: " << systemDriverVersion << '\n';
    log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

    // compute capability
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << " compute capability: " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << " CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << " compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // make this the current device and set the default scheduling and host memory mapping flags
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // report the amount of global, constant and L2 cache memory
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour and other device capabilities
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << " " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << " " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << " "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

    // read back and report the CUDA device flags
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << " thread policy: default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << " thread policy: spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << " thread policy: yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << " thread policy: blocking sync\n";
          break;
        default:
          log << " thread policy: undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << " pinned host memory allocations: enabled\n";
      } else {
        log << " pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << " kernel host memory reuse: enabled\n";
      } else {
        log << " kernel host memory reuse: disabled\n";
      }
      log << '\n';
    }

    // set and verify the CUDA device limits; see the documentation of cudaDeviceSetLimit()
    // for more information. A negative value in the configuration keeps the default.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the
    // malloc() and free() device system calls
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    // the device runtime limits are only available on compute capability 3.5 and higher
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at
      // which a thread can safely call cudaDeviceSynchronize()
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
        log << " runtime sync depth: " << std::setw(10) << value << '\n';
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << " runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  // make the GPU models and driver/runtime versions available to the ResourceInformation service
  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }


  // initialise the caching allocators and the CUDA event and stream caches
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // preallocate device and host buffers of the configured sizes
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
}

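// Tear the service down: free the cached memory, clear the event and stream caches,
// and reset every device used by the job.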
CUDAService::~CUDAService() {
  if (enabled_) {
    // free the memory held by the caching allocators and clear the event and stream caches
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());

      // explicitly destroy and clean up all resources associated with the current device;
      // any subsequent API call to this device will reinitialise it
      cudaDeviceReset();
    }
  }
}

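// Describe the configuration parameters and their defaults; this description is also
// used to generate the CUDAService cfi file.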
void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
      ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}
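
// Illustrative usage sketch (not part of this file): assuming the cfi generated from
// the description above, the defaults could be overridden from a python configuration
// along these lines, using the parameter names declared in fillDescriptions():
//
//   process.CUDAService = cms.Service("CUDAService",
//       verbose = cms.untracked.bool(True),
//       limits = cms.untracked.PSet(
//           cudaLimitStackSize = cms.untracked.int32(4 * 1024)
//       )
//   )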

namespace edm {
  namespace service {
    inline bool isProcessWideService(CUDAService const*) { return true; }
  }  // namespace service
}  // namespace edm

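// Register CUDAService with the framework as the process-wide implementation of CUDAInterface.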
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
using CUDAServiceMaker = edm::serviceregistry::ParameterSetMaker<CUDAInterface, CUDAService>;
DEFINE_FWK_SERVICE_MAKER(CUDAService, CUDAServiceMaker);