#include <iomanip>
#include <iostream>
#include <limits>
#include <set>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <nvml.h>

#include "FWCore/AbstractServices/interface/ResourceInformation.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"

class CUDAService : public CUDAInterface {
public:
  CUDAService(edm::ParameterSet const& config);
  ~CUDAService() override;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);

  bool enabled() const final { return enabled_; }

  int numberOfDevices() const final { return numberOfDevices_; }

  std::pair<int, int> computeCapability(int device) const final {
    int size = computeCapabilities_.size();
    if (device < 0 or device >= size) {
      throw std::out_of_range("Invalid device index " + std::to_string(device) + ": the valid range is from 0 to " +
                              std::to_string(size - 1));
    }
    return computeCapabilities_[device];
  }

private:
  int numberOfDevices_ = 0;
  std::vector<std::pair<int, int>> computeCapabilities_;
  bool enabled_ = false;
  bool verbose_ = false;
};

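// set a CUDA limit on the current device, logging a warning if the limit is
// unsupported or could not be set to the requested value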
void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
  // read the current device
  int device;
  cudaCheck(cudaGetDevice(&device));

  // try to set the requested limit
  auto result = cudaDeviceSetLimit(limit, request);
  if (cudaErrorUnsupportedLimit == result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
    return;
  }

  // read back the limit value
  size_t value;
  result = cudaDeviceGetLimit(&value, limit);
  if (cudaSuccess != result) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
                                   << request << ", current value is " << value;
  } else if (value != request) {
    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
                                   << " instead of requested " << request;
  }
}

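// number of CUDA cores per streaming multiprocessor for a given compute
// capability; unrecognised architectures fall back to a conservative default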
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  switch (major * 10 + minor) {
    // Fermi architecture
    case 20:
      return 32;
    case 21:
      return 48;

    // Kepler architecture
    case 30:
    case 32:
    case 35:
    case 37:
      return 192;

    // Maxwell architecture
    case 50:
    case 52:
    case 53:
      return 128;

    // Pascal architecture
    case 60:
      return 64;
    case 61:
    case 62:
      return 128;

    // Volta architecture
    case 70:
    case 72:
      return 64;

    // Turing architecture
    case 75:
      return 64;

    // Ampere architecture
    case 80:
      return 64;
    case 86:
      return 128;

    // Ada Lovelace architecture
    case 89:
      return 128;

    // Hopper architecture
    case 90:
      return 128;

    // unknown architecture, return a conservative default
    default:
      return 64;
  }
}

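// decode a CUDA version encoded as 1000 * major + 10 * minor (e.g. 11040)
// into a human-readable string (e.g. "11.4")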
std::string decodeVersion(int version) {
  return std::to_string(version / 1000) + '.' + std::to_string(version % 1000 / 10);
}

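// helpers to warm up the allocators: allocate and immediately free buffers of
// the configured sizes, so that (when the caching allocators are enabled) the
// underlying memory blocks remain cached for quick reuse during the job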
namespace {
  template <template <typename> typename UniquePtr, typename Allocate>
  void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
    if (bufferSizes.empty())
      return;

    auto streamPtr = cms::cuda::getStreamCache().get();

    std::vector<UniquePtr<char[]>> buffers;
    buffers.reserve(bufferSizes.size());
    for (auto size : bufferSizes) {
      buffers.push_back(allocate(size, streamPtr.get()));
    }
  }

  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
    // remember the current device, preallocate on every device, then restore it
    int device;
    cudaCheck(cudaGetDevice(&device));
    for (int i = 0; i < numberOfDevices; ++i) {
      cudaCheck(cudaSetDevice(i));
      preallocate<cms::cuda::device::unique_ptr>(
          [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
          bufferSizes);
    }
    cudaCheck(cudaSetDevice(device));
  }

  void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
    preallocate<cms::cuda::host::unique_ptr>(
        [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
        bufferSizes);
  }
}  // namespace
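/// Construct the service: query the NVIDIA driver and the CUDA driver and
/// runtime versions, configure every visible device, and set up the caching
/// allocators and the stream and event caches.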
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  if (not config.getUntrackedParameter<bool>("enabled")) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // query the NVIDIA system driver version, e.g. 470.57.02
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // query the CUDA driver API version, e.g. 11.4
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // query the CUDA runtime API version, e.g. 11.4
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver: " << systemDriverVersion << '\n';
    log << "CUDA driver API: " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

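  // GPU models found on this node, reported to the ResourceInformation service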
  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

    // compute capability
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << " compute capability: " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << " CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << " compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // select the device and set its flags
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // read the free and total amount of memory available for allocation by the device
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << " constant memory: " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << " L2 cache size: " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour and other capabilities
    if (verbose_) {
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << " L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << " " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << " " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << " " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << " " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << " "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

    // read back the CUDA device flags set above.
    // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information.
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << " thread policy: default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << " thread policy: spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << " thread policy: yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << " thread policy: blocking sync\n";
          break;
        default:
          log << " thread policy: undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << " pinned host memory allocations: enabled\n";
      } else {
        log << " pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << " kernel host memory reuse: enabled\n";
      } else {
        log << " kernel host memory reuse: disabled\n";
      }
      log << '\n';
    }

    // set the CUDA resource limits requested in the configuration.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the
    // malloc() and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << " printf buffer size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << " runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }

  // make sure the caching allocators and the stream and event caches are
  // constructed before declaring the service fully initialised
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // preallocate buffers if asked to do so
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
}
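/// Destruct the service: free the cached memory blocks and reset every device.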
CUDAService::~CUDAService() {
  if (enabled_) {
    // free the cached memory blocks before resetting the devices below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // explicitly destroy and clean up all resources associated with the current
      // device in the current process; any subsequent API call reinitialises the device
      cudaDeviceReset();
    }
  }
}
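/// Describe the configuration parameters accepted by the service.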
void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  edm::ParameterSetDescription desc;
  desc.addUntracked<bool>("enabled", true);
  desc.addUntracked<bool>("verbose", false);

  edm::ParameterSetDescription limits;
  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
      ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
      ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
      ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
      ->setComment(
          "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
          "the default value.");

  edm::ParameterSetDescription allocator;
  allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on all devices");
  allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
      ->setComment("Preallocates buffers of given bytes on the host");
  desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);

  descriptions.add("CUDAService", desc);
}

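// declare CUDAService as a process-wide service, shared by all sub-processes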
namespace edm {
  namespace service {
    inline bool isProcessWideService(CUDAService const*) { return true; }
  }  // namespace service
}  // namespace edm

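// register the service with the framework, exposing it via the CUDAInterface base class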
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
using CUDAServiceMaker = edm::serviceregistry::ParameterSetMaker<CUDAInterface, CUDAService>;
DEFINE_FWK_SERVICE_MAKER(CUDAService, CUDAServiceMaker);