Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-25 02:13:56

0001 #include <iomanip>
0002 #include <iostream>
0003 #include <limits>
0004 #include <set>
0005 #include <string>
0006 #include <vector>
0007 
0008 #include <hip/hip_runtime.h>
0009 #if HIP_VERSION_MAJOR >= 6
0010 // the location of rocm_version.h changed in HIP/ROCm 6.0
0011 #include <rocm-core/rocm_version.h>
0012 #else
0013 #include <rocm_version.h>
0014 #endif  // HIP_VERSION_MAJOR
0015 #include <rocm_smi/rocm_smi.h>
0016 
0017 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0018 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0019 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0020 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0021 #include "FWCore/ServiceRegistry/interface/Service.h"
0022 #include "FWCore/Utilities/interface/ResourceInformation.h"
0023 #include "HeterogeneousCore/ROCmServices/interface/ROCmInterface.h"
0024 #include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
0025 #include "HeterogeneousCore/ROCmUtilities/interface/rsmiCheck.h"
0026 
0027 class ROCmService : public ROCmInterface {
0028 public:
0029   ROCmService(edm::ParameterSet const& config);
0030   ~ROCmService() override;
0031 
0032   static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0033 
0034   bool enabled() const final { return enabled_; }
0035 
0036   int numberOfDevices() const final { return numberOfDevices_; }
0037 
0038   // Return the (major, minor) compute capability of the given device.
0039   std::pair<int, int> computeCapability(int device) const final {
0040     int size = computeCapabilities_.size();
0041     if (device < 0 or device >= size) {
0042       throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
0043                               std::to_string(size - 1));
0044     }
0045     return computeCapabilities_[device];
0046   }
0047 
0048 private:
0049   int numberOfDevices_ = 0;
0050   std::vector<std::pair<int, int>> computeCapabilities_;
0051   bool enabled_ = false;
0052   bool verbose_ = false;
0053 };
0054 
0055 void setHipLimit(hipLimit_t limit, const char* name, size_t request) {
0056 #if HIP_VERSION >= 50400000
0057   // read the current device
0058   int device;
0059   hipCheck(hipGetDevice(&device));
0060   // try to set the requested limit
0061   auto result = hipDeviceSetLimit(limit, request);
0062   if (hipErrorUnsupportedLimit == result) {
0063     edm::LogWarning("ROCmService") << "ROCm device " << device << ": unsupported limit \"" << name << "\"";
0064     return;
0065   }
0066   // read back the limit value
0067   size_t value;
0068   result = hipDeviceGetLimit(&value, limit);
0069   if (hipSuccess != result) {
0070     edm::LogWarning("ROCmService") << "ROCm device " << device << ": failed to set limit \"" << name << "\" to "
0071                                    << request << ", current value is " << value;
0072   } else if (value != request) {
0073     edm::LogWarning("ROCmService") << "ROCm device " << device << ": limit \"" << name << "\" set to " << value
0074                                    << " instead of requested " << request;
0075   }
0076 #else
0077   edm::LogWarning("ROCmService") << "ROCm versions below 5.4.0 do not support setting device limits.";
0078 #endif
0079 }
0080 
/// Format an integer-encoded version as "major.minor.patch".
///
/// The encoding is major * 10000000 + minor * 100000 + patch, e.g. 50631061 is decoded as "5.6.31061".
std::string decodeVersion(int version) {
  const int major = version / 10000000;
  const int minor = (version / 100000) % 100;
  const int patch = version % 100000;

  std::string text = std::to_string(major);
  text += '.';
  text += std::to_string(minor);
  text += '.';
  text += std::to_string(patch);
  return text;
}
0086 
0087 /// Constructor
0088 ROCmService::ROCmService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
0089   if (not config.getUntrackedParameter<bool>("enabled")) {
0090     edm::LogInfo("ROCmService") << "ROCmService disabled by configuration";
0091     return;
0092   }
0093 
0094   auto status = hipGetDeviceCount(&numberOfDevices_);
0095   if (hipSuccess != status) {
0096     edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n"
0097                                    << "Disabling the ROCmService.";
0098     return;
0099   }
0100   computeCapabilities_.reserve(numberOfDevices_);
0101 
0102   // AMD system driver version, e.g. 5.16.9.22.20 or 6.1.5
0103   char systemDriverVersion[256];
0104   rsmiCheck(rsmi_init(0x00));
0105   rsmiCheck(rsmi_version_str_get(RSMI_SW_COMP_DRIVER, systemDriverVersion, sizeof(systemDriverVersion) - 1));
0106   rsmiCheck(rsmi_shut_down());
0107 
0108   // ROCm driver version, e.g. 11.4
0109   // the full version, like 11.4.1 or 11.4.100, is not reported
0110   int driverVersion = 0;
0111   hipCheck(hipDriverGetVersion(&driverVersion));
0112 
0113   // ROCm runtime version, e.g. 11.4
0114   // the full version, like 11.4.1 or 11.4.108, is not reported
0115   int runtimeVersion = 0;
0116   hipCheck(hipRuntimeGetVersion(&runtimeVersion));
0117 
0118   edm::LogInfo log("ROCmService");
0119   if (verbose_) {
0120     log << "AMD kernel driver: " << systemDriverVersion << '\n';
0121     log << "ROCm driver API:   " << decodeVersion(driverVersion) << " (compiled with ROCm " <<
0122 #ifdef ROCM_BUILD_INFO
0123         // ROCM_BUILD_INFO has been introduced in ROCm 5.5.0
0124         ROCM_BUILD_INFO
0125 #else
0126         ROCM_VERSION_MAJOR << '.' << ROCM_VERSION_MINOR << '.' << ROCM_VERSION_PATCH
0127 #endif
0128         << ")\n";
0129     log << "ROCm runtime API:  " << decodeVersion(runtimeVersion) << " (compiled with HIP " << HIP_VERSION_MAJOR << '.'
0130         << HIP_VERSION_MINOR << '.' << HIP_VERSION_PATCH << ")\n";
0131     log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
0132   } else {
0133     log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version "
0134         << decodeVersion(driverVersion) << ", AMD driver version " << systemDriverVersion;
0135   }
0136 
0137 #if HIP_VERSION >= 50400000
0138   auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
0139   auto stackSize = limits.getUntrackedParameter<int>("hipLimitStackSize");
0140   auto mallocHeapSize = limits.getUntrackedParameter<int>("hipLimitMallocHeapSize");
0141 #endif
0142 
0143   std::set<std::string> models;
0144 
0145   for (int i = 0; i < numberOfDevices_; ++i) {
0146     // read information about the compute device.
0147     // see the documentation of hipGetDeviceProperties() for more information.
0148     hipDeviceProp_t properties;
0149     hipCheck(hipGetDeviceProperties(&properties, i));
0150     log << '\n' << "ROCm device " << i << ": " << properties.name;
0151     if (verbose_) {
0152       log << '\n';
0153     }
0154     models.insert(std::string(properties.name));
0155 
0156     // compute capabilities
0157     computeCapabilities_.emplace_back(properties.major, properties.minor);
0158     if (verbose_) {
0159       log << "  compute capability:          " << properties.gcnArchName;
0160     } else {
0161       log << " (" << properties.gcnArchName << ")";
0162     }
0163     if (verbose_) {
0164       log << '\n';
0165       log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
0166       log << "  ROCm cores: " << std::setw(28) << "not yet implemented" << '\n';
0167       // ROCm does not provide single to double performance ratio
0168     }
0169 
0170     // compute mode
0171     static constexpr const char* computeModeDescription[] = {
0172         "default (shared)",            // hipComputeModeDefault
0173         "exclusive (single thread)",   // hipComputeModeExclusive
0174         "prohibited",                  // hipComputeModeProhibited
0175         "exclusive (single process)",  // hipComputeModeExclusiveProcess
0176         "unknown"};
0177     if (verbose_) {
0178       log << "  compute mode:" << std::right << std::setw(27)
0179           << computeModeDescription[std::min(properties.computeMode,
0180                                              static_cast<int>(std::size(computeModeDescription)) - 1)]
0181           << '\n';
0182     }
0183 
0184     // TODO if a device is in exclusive use, skip it and remove it from the list, instead of failing with an exception
0185     hipCheck(hipSetDevice(i));
0186     hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost));
0187 
0188     if (verbose_) {
0189       // read the free and total amount of memory available for allocation by the device, in bytes.
0190       // see the documentation of hipMemGetInfo() for more information.
0191       size_t freeMemory = 0;
0192       size_t totalMemory = 0;
0193       hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
0194       log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
0195           << totalMemory / (1 << 20) << " MB total\n";
0196       log << "  constant memory:             " << std::setw(8) << properties.totalConstMem / (1 << 10) << " kB\n";
0197       log << "  L2 cache size:               " << std::setw(8) << properties.l2CacheSize / (1 << 10) << " kB\n";
0198 
0199       log << '\n';
0200 
0201       // other capabilities
0202       log << "Other capabilities\n";
0203       log << "  " << (properties.canMapHostMemory ? "can" : "cannot")
0204           << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n";
0205       log << "  " << (properties.pageableMemoryAccess ? "supports" : "does not support")
0206           << " coherently accessing pageable memory without calling hipHostRegister() on it\n";
0207       log << "  " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
0208           << " access pageable memory via the host's page tables\n";
0209       log << "  " << (properties.managedMemory ? "supports" : "does not support")
0210           << " allocating managed memory on this system\n";
0211       log << "  " << (properties.concurrentManagedAccess ? "can" : "cannot")
0212           << " coherently access managed memory concurrently with the host\n";
0213       log << "  "
0214           << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
0215           << " directly access managed memory on the device without migration\n";
0216       log << "  " << (properties.cooperativeLaunch ? "supports" : "does not support")
0217           << " launching cooperative kernels via hipLaunchCooperativeKernel()\n";
0218       log << "  " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
0219           << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n";
0220       log << '\n';
0221     }
0222 
0223     // set and read the ROCm device flags.
0224     // see the documentation of hipSetDeviceFlags and hipGetDeviceFlags for  more information.
0225     if (verbose_) {
0226       log << "ROCm flags\n";
0227       unsigned int flags;
0228       hipCheck(hipGetDeviceFlags(&flags));
0229       switch (flags & hipDeviceScheduleMask) {
0230         case hipDeviceScheduleAuto:
0231           log << "  thread policy:                   default\n";
0232           break;
0233         case hipDeviceScheduleSpin:
0234           log << "  thread policy:                      spin\n";
0235           break;
0236         case hipDeviceScheduleYield:
0237           log << "  thread policy:                     yield\n";
0238           break;
0239         case hipDeviceScheduleBlockingSync:
0240           log << "  thread policy:             blocking sync\n";
0241           break;
0242         default:
0243           log << "  thread policy:                 undefined\n";
0244       }
0245       if (flags & hipDeviceMapHost) {
0246         log << "  pinned host memory allocations:  enabled\n";
0247       } else {
0248         log << "  pinned host memory allocations: disabled\n";
0249       }
0250       if (flags & hipDeviceLmemResizeToMax) {
0251         log << "  kernel host memory reuse:        enabled\n";
0252       } else {
0253         log << "  kernel host memory reuse:       disabled\n";
0254       }
0255       log << '\n';
0256     }
0257 
0258     // set and read the ROCm resource limits.
0259     // see the documentation of hipDeviceSetLimit() for more information.
0260 
0261 #if HIP_VERSION >= 50400000
0262     // hipLimitStackSize controls the stack size in bytes of each GPU thread.
0263     if (stackSize >= 0) {
0264       setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize);
0265     }
0266     // hipLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
0267     // and free() device system calls.
0268     if (mallocHeapSize >= 0) {
0269       setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize);
0270     }
0271 #endif
0272 
0273     if (verbose_) {
0274       size_t value;
0275       log << "ROCm limits\n";
0276 #if HIP_VERSION >= 50400000
0277       hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize));
0278       log << "  stack size:                " << std::setw(10) << value / (1 << 10) << " kB\n";
0279 #endif
0280       hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize));
0281       log << "  malloc heap size:          " << std::setw(10) << value / (1 << 20) << " MB\n";
0282     }
0283   }
0284 
0285   edm::Service<edm::ResourceInformation> resourceInformationService;
0286   if (resourceInformationService.isAvailable()) {
0287     std::vector<std::string> modelsV(models.begin(), models.end());
0288     resourceInformationService->setGPUModels(modelsV);
0289     /*
0290     std::string nvidiaDriverVersion{systemDriverVersion};
0291     resourceInformationService->setNvidiaDriverVersion(nvidiaDriverVersion);
0292     resourceInformationService->setCudaDriverVersion(driverVersion);
0293     resourceInformationService->setCudaRuntimeVersion(runtimeVersion);
0294     */
0295   }
0296 
0297   if (verbose_) {
0298     log << '\n' << "ROCmService fully initialized";
0299   }
0300   enabled_ = true;
0301 }
0302 
0303 ROCmService::~ROCmService() {
0304   if (enabled_) {
0305     for (int i = 0; i < numberOfDevices_; ++i) {
0306       hipCheck(hipSetDevice(i));
0307       hipCheck(hipDeviceSynchronize());
0308       // Explicitly destroys and cleans up all resources associated with the current device in the
0309       // current process. Any subsequent API call to this device will reinitialize the device.
0310       // Useful to check for memory leaks.
0311       hipCheck(hipDeviceReset());
0312     }
0313   }
0314 }
0315 
0316 void ROCmService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0317   edm::ParameterSetDescription desc;
0318   desc.addUntracked<bool>("enabled", true);
0319   desc.addUntracked<bool>("verbose", false);
0320 
0321 #if HIP_VERSION >= 50400000
0322   edm::ParameterSetDescription limits;
0323   limits.addUntracked<int>("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
0324   limits.addUntracked<int>("hipLimitMallocHeapSize", -1)
0325       ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
0326   desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
0327       ->setComment(
0328           "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
0329           "the default value.");
0330 #endif
0331 
0332   descriptions.add("ROCmService", desc);
0333 }
0334 
0335 namespace edm {
0336   namespace service {
0337     inline bool isProcessWideService(ROCmService const*) { return true; }
0338   }  // namespace service
0339 }  // namespace edm
0340 
0341 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
// Register ROCmService as a framework service plugin.
// The maker type is given a name first because it contains a comma in its template argument list,
// which the preprocessor would otherwise treat as a separator between macro arguments.
// NOTE(review): presumably ParameterSetMaker builds a ROCmService from its ParameterSet and exposes it
// through the ROCmInterface base class - confirm against ServiceMaker.h.
0342 using ROCmServiceMaker = edm::serviceregistry::ParameterSetMaker<ROCmInterface, ROCmService>;
0343 DEFINE_FWK_SERVICE_MAKER(ROCmService, ROCmServiceMaker);