Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-03-17 11:05:44

0001 #include <iomanip>
0002 #include <iostream>
0003 #include <limits>
0004 #include <set>
0005 #include <stdexcept>
0006 #include <string>
0007 #include <utility>
0008 #include <vector>
0009 
0010 #include <cuda.h>
0011 #include <cuda_runtime.h>
0012 #include <nvml.h>
0013 
0014 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0015 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0016 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0017 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0018 #include "FWCore/ServiceRegistry/interface/Service.h"
0019 #include "FWCore/Utilities/interface/ResourceInformation.h"
0020 #include "FWCore/Utilities/interface/ReusableObjectHolder.h"
0021 #include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
0022 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
0023 #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
0024 #include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
0025 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
0026 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
0027 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
0028 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
0029 #include "HeterogeneousCore/CUDAUtilities/interface/nvmlCheck.h"
0030 
0031 class CUDAService : public CUDAInterface {
0032 public:
0033   CUDAService(edm::ParameterSet const& config);
0034   ~CUDAService() override;
0035 
0036   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0037 
0038   bool enabled() const final { return enabled_; }
0039 
0040   int numberOfDevices() const final { return numberOfDevices_; }
0041 
0042   // Return the (major, minor) CUDA compute capability of the given device.
0043   std::pair<int, int> computeCapability(int device) const final {
0044     int size = computeCapabilities_.size();
0045     if (device < 0 or device >= size) {
0046       throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
0047                               std::to_string(size - 1));
0048     }
0049     return computeCapabilities_[device];
0050   }
0051 
0052 private:
0053   int numberOfDevices_ = 0;
0054   std::vector<std::pair<int, int>> computeCapabilities_;
0055   bool enabled_ = false;
0056   bool verbose_ = false;
0057 };
0058 
0059 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
0060   // read the current device
0061   int device;
0062   cudaCheck(cudaGetDevice(&device));
0063   // try to set the requested limit
0064   auto result = cudaDeviceSetLimit(limit, request);
0065   if (cudaErrorUnsupportedLimit == result) {
0066     edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
0067     return;
0068   }
0069   // read back the limit value
0070   size_t value;
0071   result = cudaDeviceGetLimit(&value, limit);
0072   if (cudaSuccess != result) {
0073     edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to "
0074                                    << request << ", current value is " << value;
0075   } else if (value != request) {
0076     edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value
0077                                    << " instead of requested " << request;
0078   }
0079 }
0080 
// Number of CUDA cores per streaming multiprocessor for a given compute
// capability (major, minor).  Unknown architectures fall back to 64.
constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) {
  const unsigned int sm = major * 10 + minor;

  // Fermi architecture
  if (sm == 20)  // SM 2.0: GF100 class
    return 32;
  if (sm == 21)  // SM 2.1: GF10x class
    return 48;

  // Kepler architecture: GK10x / GK11x / GK21x classes
  if (sm == 30 or sm == 32 or sm == 35 or sm == 37)
    return 192;

  // Maxwell architecture: GM10x / GM20x classes
  if (sm == 50 or sm == 52 or sm == 53)
    return 128;

  // Pascal architecture
  if (sm == 60)  // SM 6.0: GP100 class
    return 64;
  if (sm == 61 or sm == 62)  // SM 6.1 / 6.2: GP10x class
    return 128;

  // Volta architecture: GV100 / GV11b classes
  if (sm == 70 or sm == 72)
    return 64;

  // Turing architecture
  if (sm == 75)  // SM 7.5: TU10x class
    return 64;

  // Ampere architecture
  if (sm == 80)  // SM 8.0: GA100 class
    return 64;
  if (sm == 86)  // SM 8.6: GA10x class
    return 128;

  // Ada Lovelace architecture
  if (sm == 89)  // SM 8.9: AD10x class
    return 128;

  // Hopper architecture
  if (sm == 90)  // SM 9.0: GH100 class
    return 128;

  // unknown architecture, return a default value
  return 64;
}
0137 
// Decode a CUDA version number encoded as 1000 * major + 10 * minor
// (e.g. 11040) into a human-readable "major.minor" string (e.g. "11.4").
std::string decodeVersion(int version) {
  const int major = version / 1000;
  const int minor = (version % 1000) / 10;
  return std::to_string(major) + '.' + std::to_string(minor);
}
0141 
0142 namespace {
0143   template <template <typename> typename UniquePtr, typename Allocate>
0144   void preallocate(Allocate allocate, const std::vector<unsigned int>& bufferSizes) {
0145     if (bufferSizes.empty())
0146       return;
0147 
0148     auto streamPtr = cms::cuda::getStreamCache().get();
0149 
0150     std::vector<UniquePtr<char[]>> buffers;
0151     buffers.reserve(bufferSizes.size());
0152     for (auto size : bufferSizes) {
0153       buffers.push_back(allocate(size, streamPtr.get()));
0154     }
0155   }
0156 
0157   void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
0158     int device;
0159     cudaCheck(cudaGetDevice(&device));
0160     for (int i = 0; i < numberOfDevices; ++i) {
0161       cudaCheck(cudaSetDevice(i));
0162       preallocate<cms::cuda::device::unique_ptr>(
0163           [&](size_t size, cudaStream_t stream) { return cms::cuda::make_device_unique<char[]>(size, stream); },
0164           bufferSizes);
0165     }
0166     cudaCheck(cudaSetDevice(device));
0167   }
0168 
0169   void hostPreallocate(const std::vector<unsigned int>& bufferSizes) {
0170     preallocate<cms::cuda::host::unique_ptr>(
0171         [&](size_t size, cudaStream_t stream) { return cms::cuda::make_host_unique<char[]>(size, stream); },
0172         bufferSizes);
0173   }
0174 }  // namespace
0175 
/// Constructor: probes the CUDA runtime, logs the driver/runtime versions and
/// the properties of every visible device, applies the configured per-device
/// limits, constructs the caching allocators and stream/event caches, and
/// optionally preallocates buffers.  On any runtime-initialisation failure the
/// service stays disabled instead of aborting the job.
CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
  if (not config.getUntrackedParameter<bool>("enabled")) {
    edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
    return;
  }

  // Probe the CUDA runtime; a failure here (e.g. no driver, no devices)
  // leaves the service disabled (enabled_ stays false).
  auto status = cudaGetDeviceCount(&numberOfDevices_);
  if (cudaSuccess != status) {
    edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n"
                                   << "Disabling the CUDAService.";
    return;
  }
  computeCapabilities_.reserve(numberOfDevices_);

  // NVIDIA system driver version, e.g. 470.57.02
  // NVML is initialised without attaching to any GPU, queried, and shut down again.
  char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
  nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
  nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
  nvmlCheck(nvmlShutdown());

  // CUDA driver version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.100, is not reported
  int driverVersion = 0;
  cudaCheck(cudaDriverGetVersion(&driverVersion));

  // CUDA runtime version, e.g. 11.4
  // the full version, like 11.4.1 or 11.4.108, is not reported
  int runtimeVersion = 0;
  cudaCheck(cudaRuntimeGetVersion(&runtimeVersion));

  // A single report is accumulated into this message object over the whole
  // constructor; verbose_ selects between a detailed and a one-line summary.
  edm::LogInfo log("CUDAService");
  if (verbose_) {
    log << "NVIDIA driver:    " << systemDriverVersion << '\n';
    log << "CUDA driver API:  " << decodeVersion(driverVersion) << " (compiled with " << decodeVersion(CUDA_VERSION)
        << ")\n";
    log << "CUDA runtime API: " << decodeVersion(runtimeVersion) << " (compiled with " << decodeVersion(CUDART_VERSION)
        << ")\n";
    log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
  } else {
    log << "CUDA runtime version " << decodeVersion(runtimeVersion) << ", driver version "
        << decodeVersion(driverVersion) << ", NVIDIA driver version " << systemDriverVersion;
  }

  // Requested device limits; -1 means "leave the device default unchanged"
  // (see fillDescriptions()).
  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
  auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
  auto stackSize = limits.getUntrackedParameter<int>("cudaLimitStackSize");
  auto mallocHeapSize = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
  auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

  // Unique device model names, forwarded to the ResourceInformation service below.
  std::set<std::string> models;

  for (int i = 0; i < numberOfDevices_; ++i) {
    // read information about the compute device.
    // see the documentation of cudaGetDeviceProperties() for more information.
    cudaDeviceProp properties;
    cudaCheck(cudaGetDeviceProperties(&properties, i));
    log << '\n' << "CUDA device " << i << ": " << properties.name;
    if (verbose_) {
      log << '\n';
    }
    models.insert(std::string(properties.name));

    // compute capabilities
    computeCapabilities_.emplace_back(properties.major, properties.minor);
    if (verbose_) {
      log << "  compute capability:          " << properties.major << "." << properties.minor;
    }
    log << " (sm_" << properties.major << properties.minor << ")";
    if (verbose_) {
      log << '\n';
      log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
      log << "  CUDA cores: " << std::setw(28)
          << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
      log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
          << ":1\n";
    }

    // compute mode
    // entries match the cudaComputeMode enum values; the last entry is a
    // catch-all for any newer, unrecognised mode (clamped via std::min below).
    static constexpr const char* computeModeDescription[] = {
        "default (shared)",            // cudaComputeModeDefault
        "exclusive (single thread)",   // cudaComputeModeExclusive
        "prohibited",                  // cudaComputeModeProhibited
        "exclusive (single process)",  // cudaComputeModeExclusiveProcess
        "unknown"};
    if (verbose_) {
      log << "  compute mode:" << std::right << std::setw(27)
          << computeModeDescription[std::min(properties.computeMode,
                                             static_cast<int>(std::size(computeModeDescription)) - 1)]
          << '\n';
    }

    // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with abort()
    cudaCheck(cudaSetDevice(i));
    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));

    // read the free and total amount of memory available for allocation by the device, in bytes.
    // see the documentation of cudaMemGetInfo() for more information.
    if (verbose_) {
      size_t freeMemory, totalMemory;
      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
      log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
          << totalMemory / (1 << 20) << " MB total\n";
      log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
      log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
    }

    // L1 cache behaviour
    if (verbose_) {
      // index 0..3 encodes (localL1CacheSupported, globalL1CacheSupported) as a 2-bit value
      static constexpr const char* l1CacheModeDescription[] = {
          "unknown", "local memory", "global memory", "local and global memory"};
      int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
      log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
      log << '\n';

      log << "Other capabilities\n";
      log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
          << " map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer()\n";
      log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
          << " coherently accessing pageable memory without calling cudaHostRegister() on it\n";
      log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
          << " access pageable memory via the host's page tables\n";
      log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
          << " access host registered memory at the same virtual address as the host\n";
      log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
          << " a unified address space with the host\n";
      log << "  " << (properties.managedMemory ? "supports" : "does not support")
          << " allocating managed memory on this system\n";
      log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
          << " coherently access managed memory concurrently with the host\n";
      log << "  "
          << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
          << " directly access managed memory on the device without migration\n";
      log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernel()\n";
      log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
          << " launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice()\n";
      log << '\n';
    }

    // set and read the CUDA device flags.
    // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for  more information.
    if (verbose_) {
      log << "CUDA flags\n";
      unsigned int flags;
      cudaCheck(cudaGetDeviceFlags(&flags));
      switch (flags & cudaDeviceScheduleMask) {
        case cudaDeviceScheduleAuto:
          log << "  thread policy:                   default\n";
          break;
        case cudaDeviceScheduleSpin:
          log << "  thread policy:                      spin\n";
          break;
        case cudaDeviceScheduleYield:
          log << "  thread policy:                     yield\n";
          break;
        case cudaDeviceScheduleBlockingSync:
          log << "  thread policy:             blocking sync\n";
          break;
        default:
          log << "  thread policy:                 undefined\n";
      }
      if (flags & cudaDeviceMapHost) {
        log << "  pinned host memory allocations:  enabled\n";
      } else {
        log << "  pinned host memory allocations: disabled\n";
      }
      if (flags & cudaDeviceLmemResizeToMax) {
        log << "  kernel host memory reuse:        enabled\n";
      } else {
        log << "  kernel host memory reuse:       disabled\n";
      }
      log << '\n';
    }

    // set and read the CUDA resource limits.
    // see the documentation of cudaDeviceSetLimit() for more information.

    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
    // printf() device system call.
    if (printfFifoSize >= 0) {
      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
    }
    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
    if (stackSize >= 0) {
      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
    }
    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
    // and free() device system calls.
    if (mallocHeapSize >= 0) {
      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
    }
    // the device-runtime limits only exist for compute capability 3.5 and above
    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
      // a thread can safely call cudaDeviceSynchronize().
      if (devRuntimeSyncDepth >= 0) {
        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
      }
      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
      // device runtime launches that can be made from the current device.
      if (devRuntimePendingLaunchCount >= 0) {
        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount,
                     "cudaLimitDevRuntimePendingLaunchCount",
                     devRuntimePendingLaunchCount);
      }
    }

    // report the effective limits, after any changes requested above
    if (verbose_) {
      size_t value;
      log << "CUDA limits\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
      log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
      log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
      log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
      if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
        log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
        cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
        log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
      }
    }
  }

  // Publish the hardware/driver information to the ResourceInformation
  // service, if one is configured in this job.
  edm::Service<edm::ResourceInformation> resourceInformationService;
  if (resourceInformationService.isAvailable()) {
    std::vector<std::string> modelsV(models.begin(), models.end());
    resourceInformationService->setGPUModels(modelsV);
    std::string nvidiaDriverVersion{systemDriverVersion};
    resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
    resourceInformationService->setCudaDriverVersion(driverVersion);
    resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
  }

  // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
  if constexpr (cms::cuda::allocator::useCaching) {
    cms::cuda::allocator::cachingAllocatorsConstruct();
  }
  cms::cuda::getEventCache().clear();
  cms::cuda::getStreamCache().clear();

  if (verbose_) {
    log << '\n' << "CUDAService fully initialized";
  }
  enabled_ = true;

  // Preallocate buffers if asked to
  auto const& allocator = config.getUntrackedParameter<edm::ParameterSet>("allocator");
  devicePreallocate(numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int>>("devicePreallocate"));
  hostPreallocate(allocator.getUntrackedParameter<std::vector<unsigned int>>("hostPreallocate"));
}
0429 
/// Destructor: tears down the caches and allocators, then resets every
/// device.  The ordering matters: all cached memory and stream/event
/// handles must be released before cudaDeviceReset() destroys the
/// per-device contexts they belong to.
CUDAService::~CUDAService() {
  if (enabled_) {
    // Explicitly destruct the allocator before the device resets below
    if constexpr (cms::cuda::allocator::useCaching) {
      cms::cuda::allocator::cachingAllocatorsFreeCached();
    }
    cms::cuda::getEventCache().clear();
    cms::cuda::getStreamCache().clear();

    for (int i = 0; i < numberOfDevices_; ++i) {
      cudaCheck(cudaSetDevice(i));
      cudaCheck(cudaDeviceSynchronize());
      // Explicitly destroys and cleans up all resources associated with the current device in the
      // current process. Any subsequent API call to this device will reinitialize the device.
      // Useful to check for memory leaks with `cuda-memcheck --tool memcheck --leak-check full`.
      cudaDeviceReset();
    }
  }
}
0449 
0450 void CUDAService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0451   edm::ParameterSetDescription desc;
0452   desc.addUntracked<bool>("enabled", true);
0453   desc.addUntracked<bool>("verbose", false);
0454 
0455   edm::ParameterSetDescription limits;
0456   limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)
0457       ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
0458   limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
0459   limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)
0460       ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
0461   limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)
0462       ->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
0463   limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)
0464       ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
0465   desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
0466       ->setComment(
0467           "See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
0468           "the default value.");
0469 
0470   edm::ParameterSetDescription allocator;
0471   allocator.addUntracked<std::vector<unsigned int>>("devicePreallocate", std::vector<unsigned int>{})
0472       ->setComment("Preallocates buffers of given bytes on all devices");
0473   allocator.addUntracked<std::vector<unsigned int>>("hostPreallocate", std::vector<unsigned int>{})
0474       ->setComment("Preallocates buffers of given bytes on the host");
0475   desc.addUntracked<edm::ParameterSetDescription>("allocator", allocator);
0476 
0477   descriptions.add("CUDAService", desc);
0478 }
0479 
namespace edm {
  namespace service {
    // Marks CUDAService as a process-wide service (presumably so a single
    // instance is shared across sub-processes rather than one per
    // sub-process -- NOTE(review): confirm against the ServiceMaker docs).
    inline bool isProcessWideService(CUDAService const*) { return true; }
  }  // namespace service
}  // namespace edm
0485 
#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
// Register the service with the framework, exposed to consumers through the
// abstract CUDAInterface base class rather than the concrete CUDAService type.
using CUDAServiceMaker = edm::serviceregistry::ParameterSetMaker<CUDAInterface, CUDAService>;
DEFINE_FWK_SERVICE_MAKER(CUDAService, CUDAServiceMaker);