Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-03-17 11:05:49

0001 #include <iomanip>
0002 #include <iostream>
0003 #include <limits>
0004 #include <set>
0005 #include <string>
0006 #include <vector>
0007 
0008 #include <hip/hip_runtime.h>
0009 /*
0010 #include <nvml.h>
0011 */
0012 
0013 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0014 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0015 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0016 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0017 #include "FWCore/ServiceRegistry/interface/Service.h"
0018 #include "FWCore/Utilities/interface/ResourceInformation.h"
0019 #include "HeterogeneousCore/ROCmServices/interface/ROCmInterface.h"
0020 #include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
0021 /*
0022 #include "HeterogeneousCore/ROCmUtilities/interface/nvmlCheck.h"
0023 */
0024 
0025 class ROCmService : public ROCmInterface {
0026 public:
0027   ROCmService(edm::ParameterSet const& config);
0028   ~ROCmService() override;
0029 
0030   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0031 
0032   bool enabled() const final { return enabled_; }
0033 
0034   int numberOfDevices() const final { return numberOfDevices_; }
0035 
0036   // Return the (major, minor) compute capability of the given device.
0037   std::pair<int, int> computeCapability(int device) const final {
0038     int size = computeCapabilities_.size();
0039     if (device < 0 or device >= size) {
0040       throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
0041                               std::to_string(size - 1));
0042     }
0043     return computeCapabilities_[device];
0044   }
0045 
0046 private:
0047   int numberOfDevices_ = 0;
0048   std::vector<std::pair<int, int>> computeCapabilities_;
0049   bool enabled_ = false;
0050   bool verbose_ = false;
0051 };
0052 
0053 void setHipLimit(hipLimit_t limit, const char* name, size_t request) {
0054   // read the current device
0055   int device;
0056   hipCheck(hipGetDevice(&device));
0057   // try to set the requested limit
0058   auto result = hipDeviceSetLimit(limit, request);
0059   if (hipErrorUnsupportedLimit == result) {
0060     edm::LogWarning("ROCmService") << "ROCm device " << device << ": unsupported limit \"" << name << "\"";
0061     return;
0062   }
0063   // read back the limit value
0064   size_t value;
0065   result = hipDeviceGetLimit(&value, limit);
0066   if (hipSuccess != result) {
0067     edm::LogWarning("ROCmService") << "ROCm device " << device << ": failed to set limit \"" << name << "\" to "
0068                                    << request << ", current value is " << value;
0069   } else if (value != request) {
0070     edm::LogWarning("ROCmService") << "ROCm device " << device << ": limit \"" << name << "\" set to " << value
0071                                    << " instead of requested " << request;
0072   }
0073 }
0074 
/// Convert an encoded HIP version number into a "major.minor.patch" string.
///
/// HIP encodes its version as HIP_VERSION = major * 10000000 + minor * 100000 + patch
/// (e.g. 50422804 for 5.4.22804), unlike the CUDA scheme (major * 1000 + minor * 10).
///
/// @param version  encoded version, as returned by hipRuntimeGetVersion() or hipDriverGetVersion()
/// @return the decoded version, e.g. "5.4.22804"
std::string decodeVersion(int version) {
  int const major = version / 10000000;
  int const minor = version / 100000 % 100;
  int const patch = version % 100000;
  return std::to_string(major) + '.' + std::to_string(minor) + '.' + std::to_string(patch);
}
0078 
0079 /// Constructor
0080 ROCmService::ROCmService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
0081   if (not config.getUntrackedParameter<bool>("enabled")) {
0082     edm::LogInfo("ROCmService") << "ROCmService disabled by configuration";
0083     return;
0084   }
0085 
0086   auto status = hipGetDeviceCount(&numberOfDevices_);
0087   if (hipSuccess != status) {
0088     edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n"
0089                                    << "Disabling the ROCmService.";
0090     return;
0091   }
0092   computeCapabilities_.reserve(numberOfDevices_);
0093 
0094   /*
0095   // AMD system driver version, e.g. 470.57.02
0096   char systemDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
0097   nvmlCheck(nvmlInitWithFlags(NVML_INIT_FLAG_NO_GPUS | NVML_INIT_FLAG_NO_ATTACH));
0098   nvmlCheck(nvmlSystemGetDriverVersion(systemDriverVersion, sizeof(systemDriverVersion)));
0099   nvmlCheck(nvmlShutdown());
0100   */
0101 
0102   // ROCm driver version, e.g. 11.4
0103   // the full version, like 11.4.1 or 11.4.100, is not reported
0104   int driverVersion = 0;
0105   hipCheck(hipDriverGetVersion(&driverVersion));
0106 
0107   // ROCm runtime version, e.g. 11.4
0108   // the full version, like 11.4.1 or 11.4.108, is not reported
0109   int runtimeVersion = 0;
0110   hipCheck(hipRuntimeGetVersion(&runtimeVersion));
0111 
0112   edm::LogInfo log("ROCmService");
0113   if (verbose_) {
0114     /*
0115     log << "AMD driver:       " << systemDriverVersion << '\n';
0116     */
0117     log << "ROCm driver API:  " << decodeVersion(driverVersion) << /*" (compiled with " << decodeVersion(ROCm_VERSION)
0118         << ")" */
0119         "\n";
0120     log << "ROCm runtime API: " << decodeVersion(runtimeVersion)
0121         << /*" (compiled with " << decodeVersion(ROCmRT_VERSION)
0122         << ")" */
0123         "\n";
0124     log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
0125   } else {
0126     log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version "
0127         << decodeVersion(driverVersion)
0128         /*
0129         << ", AMD driver version " << systemDriverVersion
0130         */
0131         ;
0132   }
0133 
0134   auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
0135   /*
0136   auto printfFifoSize = limits.getUntrackedParameter<int>("hipLimitPrintfFifoSize");
0137   */
0138   auto stackSize = limits.getUntrackedParameter<int>("hipLimitStackSize");
0139   auto mallocHeapSize = limits.getUntrackedParameter<int>("hipLimitMallocHeapSize");
0140   /*
0141   auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("hipLimitDevRuntimeSyncDepth");
0142   auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("hipLimitDevRuntimePendingLaunchCount");
0143   */
0144 
0145   std::set<std::string> models;
0146 
0147   for (int i = 0; i < numberOfDevices_; ++i) {
0148     // read information about the compute device.
0149     // see the documentation of hipGetDeviceProperties() for more information.
0150     hipDeviceProp_t properties;
0151     hipCheck(hipGetDeviceProperties(&properties, i));
0152     log << '\n' << "ROCm device " << i << ": " << properties.name;
0153     if (verbose_) {
0154       log << '\n';
0155     }
0156     models.insert(std::string(properties.name));
0157 
0158     // compute capabilities
0159     computeCapabilities_.emplace_back(properties.major, properties.minor);
0160     if (verbose_) {
0161       log << "  compute capability:          " << properties.major << "." << properties.minor;
0162     }
0163     log << " (sm_" << properties.major << properties.minor << ")";
0164     if (verbose_) {
0165       log << '\n';
0166       log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
0167       log << "  ROCm cores: " << std::setw(28) << "not yet implemented" << '\n';
0168       /*
0169       log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio
0170           << ":1\n";
0171       */
0172     }
0173 
0174     // compute mode
0175     static constexpr const char* computeModeDescription[] = {
0176         "default (shared)",            // hipComputeModeDefault
0177         "exclusive (single thread)",   // hipComputeModeExclusive
0178         "prohibited",                  // hipComputeModeProhibited
0179         "exclusive (single process)",  // hipComputeModeExclusiveProcess
0180         "unknown"};
0181     if (verbose_) {
0182       log << "  compute mode:" << std::right << std::setw(27)
0183           << computeModeDescription[std::min(properties.computeMode,
0184                                              static_cast<int>(std::size(computeModeDescription)) - 1)]
0185           << '\n';
0186     }
0187 
0188     // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception
0189     hipCheck(hipSetDevice(i));
0190     hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost));
0191 
0192     // read the free and total amount of memory available for allocation by the device, in bytes.
0193     // see the documentation of hipMemGetInfo() for more information.
0194     if (verbose_) {
0195       size_t freeMemory, totalMemory;
0196       hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
0197       log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
0198           << totalMemory / (1 << 20) << " MB total\n";
0199       log << "  constant memory:               " << std::setw(6) << properties.totalConstMem / (1 << 10) << " kB\n";
0200       log << "  L2 cache size:                 " << std::setw(6) << properties.l2CacheSize / (1 << 10) << " kB\n";
0201     }
0202 
0203     // L1 cache behaviour
0204     if (verbose_) {
0205       /*
0206       static constexpr const char* l1CacheModeDescription[] = {
0207           "unknown", "local memory", "global memory", "local and global memory"};
0208       int l1CacheMode = properties.localL1CacheSupported + 2 * properties.globalL1CacheSupported;
0209       log << "  L1 cache mode:" << std::setw(26) << std::right << l1CacheModeDescription[l1CacheMode] << '\n';
0210       log << '\n';
0211       */
0212 
0213       log << "Other capabilities\n";
0214       log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
0215           << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n";
0216       log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
0217           << " coherently accessing pageable memory without calling hipHostRegister() on it\n";
0218       log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
0219           << " access pageable memory via the host's page tables\n";
0220       /*
0221       log << "  " << (properties.canUseHostPointerForRegisteredMem ? "can" : "cannot")
0222           << " access host registered memory at the same virtual address as the host\n";
0223       log << "  " << (properties.unifiedAddressing ? "shares" : "does not share")
0224           << " a unified address space with the host\n";
0225       */
0226       log << "  " << (properties.managedMemory ? "supports" : "does not support")
0227           << " allocating managed memory on this system\n";
0228       log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
0229           << " coherently access managed memory concurrently with the host\n";
0230       log << "  "
0231           << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
0232           << " directly access managed memory on the device without migration\n";
0233       log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
0234           << " launching cooperative kernels via hipLaunchCooperativeKernel()\n";
0235       log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
0236           << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n";
0237       log << '\n';
0238     }
0239 
0240     // set and read the ROCm device flags.
0241     // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for  more information.
0242     if (verbose_) {
0243       log << "ROCm flags\n";
0244       unsigned int flags;
0245       hipCheck(hipGetDeviceFlags(&flags));
0246       switch (flags & hipDeviceScheduleMask) {
0247         case hipDeviceScheduleAuto:
0248           log << "  thread policy:                   default\n";
0249           break;
0250         case hipDeviceScheduleSpin:
0251           log << "  thread policy:                      spin\n";
0252           break;
0253         case hipDeviceScheduleYield:
0254           log << "  thread policy:                     yield\n";
0255           break;
0256         case hipDeviceScheduleBlockingSync:
0257           log << "  thread policy:             blocking sync\n";
0258           break;
0259         default:
0260           log << "  thread policy:                 undefined\n";
0261       }
0262       if (flags & hipDeviceMapHost) {
0263         log << "  pinned host memory allocations:  enabled\n";
0264       } else {
0265         log << "  pinned host memory allocations: disabled\n";
0266       }
0267       if (flags & hipDeviceLmemResizeToMax) {
0268         log << "  kernel host memory reuse:        enabled\n";
0269       } else {
0270         log << "  kernel host memory reuse:       disabled\n";
0271       }
0272       log << '\n';
0273     }
0274 
0275     // set and read the ROCm resource limits.
0276     // see the documentation of hipDeviceSetLimit() for more information.
0277 
0278     /*
0279     // hipLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
0280     // printf() device system call.
0281     if (printfFifoSize >= 0) {
0282       setHipLimit(hipLimitPrintfFifoSize, "hipLimitPrintfFifoSize", printfFifoSize);
0283     }
0284     */
0285     // hipLimitStackSize controls the stack size in bytes of each GPU thread.
0286     if (stackSize >= 0) {
0287       setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize);
0288     }
0289     // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
0290     // and free() device system calls.
0291     if (mallocHeapSize >= 0) {
0292       setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize);
0293     }
0294     /*
0295     if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
0296       // hipLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
0297       // a thread can safely call hipDeviceSynchronize().
0298       if (devRuntimeSyncDepth >= 0) {
0299         setHipLimit(hipLimitDevRuntimeSyncDepth, "hipLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
0300       }
0301       // hipLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
0302       // device runtime launches that can be made from the current device.
0303       if (devRuntimePendingLaunchCount >= 0) {
0304         setHipLimit(
0305             hipLimitDevRuntimePendingLaunchCount, "hipLimitDevRuntimePendingLaunchCount", devRuntimePendingLaunchCount);
0306       }
0307     }
0308     */
0309 
0310     if (verbose_) {
0311       size_t value;
0312       log << "ROCm limits\n";
0313       /*
0314       hipCheck(hipDeviceGetLimit(&value, hipLimitPrintfFifoSize));
0315       log << "  printf buffer size:        " << std::setw(10) << value / (1 << 20) << " MB\n";
0316       */
0317       hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize));
0318       log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
0319       hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize));
0320       log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
0321       /*
0322       if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
0323         hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimeSyncDepth));
0324         log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
0325         hipCheck(hipDeviceGetLimit(&value, hipLimitDevRuntimePendingLaunchCount));
0326         log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
0327       }
0328       */
0329     }
0330   }
0331 
0332   edm::Service<edm::ResourceInformation> resourceInformationService;
0333   if (resourceInformationService.isAvailable()) {
0334     std::vector<std::string> modelsV(models.begin(), models.end());
0335     resourceInformationService->setGPUModels(modelsV);
0336     /*
0337     std::string nvidiaDriverVersion{systemDriverVersion};
0338     resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
0339     resourceInformationService->setCudaDriverVersion(driverVersion);
0340     resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
0341     */
0342   }
0343 
0344   if (verbose_) {
0345     log << '\n' << "ROCmService fully initialized";
0346   }
0347   enabled_ = true;
0348 }
0349 
0350 ROCmService::~ROCmService() {
0351   if (enabled_) {
0352     for (int i = 0; i < numberOfDevices_; ++i) {
0353       hipCheck(hipSetDevice(i));
0354       hipCheck(hipDeviceSynchronize());
0355       // Explicitly destroys and cleans up all resources associated with the current device in the
0356       // current process. Any subsequent API call to this device will reinitialize the device.
0357       // Useful to check for memory leaks.
0358       hipCheck(hipDeviceReset());
0359     }
0360   }
0361 }
0362 
0363 void ROCmService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0364   edm::ParameterSetDescription desc;
0365   desc.addUntracked<bool>("enabled", true);
0366   desc.addUntracked<bool>("verbose", false);
0367 
0368   edm::ParameterSetDescription limits;
0369   /*
0370   limits.addUntracked<int>("hipLimitPrintfFifoSize", -1)
0371       ->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
0372   */
0373   limits.addUntracked<int>("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
0374   limits.addUntracked<int>("hipLimitMallocHeapSize", -1)
0375       ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
0376   limits.addUntracked<int>("hipLimitDevRuntimeSyncDepth", -1)
0377       ->setComment("Maximum nesting depth of a grid at which a thread can safely call hipDeviceSynchronize().");
0378   limits.addUntracked<int>("hipLimitDevRuntimePendingLaunchCount", -1)
0379       ->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
0380   desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
0381       ->setComment(
0382           "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
0383           "the default value.");
0384 
0385   descriptions.add("ROCmService", desc);
0386 }
0387 
0388 namespace edm {
0389   namespace service {
0390     inline bool isProcessWideService(ROCmService const*) { return true; }
0391   }  // namespace service
0392 }  // namespace edm
0393 
// Register ROCmService with the framework through DEFINE_FWK_SERVICE_MAKER, using a maker that
// is parameterised on both the ROCmInterface base class and the concrete ROCmService.
// NOTE(review): the alias is presumably required because the comma in the template argument list
// would otherwise be split into separate macro arguments — do not inline it without checking.
0394 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0395 using ROCmServiceMaker = edm::serviceregistry::ParameterSetMaker<ROCmInterface, ROCmService>;
0396 DEFINE_FWK_SERVICE_MAKER(ROCmService, ROCmServiceMaker);