File indexing completed on 2024-04-25 02:13:56
0001 #include <iomanip>
0002 #include <iostream>
0003 #include <limits>
0004 #include <set>
0005 #include <string>
0006 #include <vector>
0007
0008 #include <hip/hip_runtime.h>
0009 #if HIP_VERSION_MAJOR >= 6
0010
0011 #include <rocm-core/rocm_version.h>
0012 #else
0013 #include <rocm_version.h>
0014 #endif
0015 #include <rocm_smi/rocm_smi.h>
0016
0017 #include "FWCore/MessageLogger/interface/MessageLogger.h"
0018 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
0019 #include "FWCore/ParameterSet/interface/ParameterSet.h"
0020 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
0021 #include "FWCore/ServiceRegistry/interface/Service.h"
0022 #include "FWCore/Utilities/interface/ResourceInformation.h"
0023 #include "HeterogeneousCore/ROCmServices/interface/ROCmInterface.h"
0024 #include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
0025 #include "HeterogeneousCore/ROCmUtilities/interface/rsmiCheck.h"
0026
0027 class ROCmService : public ROCmInterface {
0028 public:
0029 ROCmService(edm::ParameterSet const& config);
0030 ~ROCmService() override;
0031
0032 static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
0033
0034 bool enabled() const final { return enabled_; }
0035
0036 int numberOfDevices() const final { return numberOfDevices_; }
0037
0038
0039 std::pair<int, int> computeCapability(int device) const final {
0040 int size = computeCapabilities_.size();
0041 if (device < 0 or device >= size) {
0042 throw std::out_of_range("Invalid device index" + std::to_string(device) + ": the valid range is from 0 to " +
0043 std::to_string(size - 1));
0044 }
0045 return computeCapabilities_[device];
0046 }
0047
0048 private:
0049 int numberOfDevices_ = 0;
0050 std::vector<std::pair<int, int>> computeCapabilities_;
0051 bool enabled_ = false;
0052 bool verbose_ = false;
0053 };
0054
0055 void setHipLimit(hipLimit_t limit, const char* name, size_t request) {
0056 #if HIP_VERSION >= 50400000
0057
0058 int device;
0059 hipCheck(hipGetDevice(&device));
0060
0061 auto result = hipDeviceSetLimit(limit, request);
0062 if (hipErrorUnsupportedLimit == result) {
0063 edm::LogWarning("ROCmService") << "ROCm device " << device << ": unsupported limit \"" << name << "\"";
0064 return;
0065 }
0066
0067 size_t value;
0068 result = hipDeviceGetLimit(&value, limit);
0069 if (hipSuccess != result) {
0070 edm::LogWarning("ROCmService") << "ROCm device " << device << ": failed to set limit \"" << name << "\" to "
0071 << request << ", current value is " << value;
0072 } else if (value != request) {
0073 edm::LogWarning("ROCmService") << "ROCm device " << device << ": limit \"" << name << "\" set to " << value
0074 << " instead of requested " << request;
0075 }
0076 #else
0077 edm::LogWarning("ROCmService") << "ROCm versions below 5.4.0 do not support setting device limits.";
0078 #endif
0079 }
0080
/// Format a packed version number as "major.minor.patch".
///
/// The input is split as major = version / 10000000, minor = (version / 100000) % 100
/// and patch = version % 100000.
std::string decodeVersion(int version) {
  int const major = version / 10000000;
  int const minor = version / 100000 % 100;
  int const patch = version % 100000;

  std::string decoded = std::to_string(major);
  decoded += '.';
  decoded += std::to_string(minor);
  decoded += '.';
  decoded += std::to_string(patch);
  return decoded;
}
0086
0087
0088 ROCmService::ROCmService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
0089 if (not config.getUntrackedParameter<bool>("enabled")) {
0090 edm::LogInfo("ROCmService") << "ROCmService disabled by configuration";
0091 return;
0092 }
0093
0094 auto status = hipGetDeviceCount(&numberOfDevices_);
0095 if (hipSuccess != status) {
0096 edm::LogWarning("ROCmService") << "Failed to initialize the ROCm runtime.\n"
0097 << "Disabling the ROCmService.";
0098 return;
0099 }
0100 computeCapabilities_.reserve(numberOfDevices_);
0101
0102
0103 char systemDriverVersion[256];
0104 rsmiCheck(rsmi_init(0x00));
0105 rsmiCheck(rsmi_version_str_get(RSMI_SW_COMP_DRIVER, systemDriverVersion, sizeof(systemDriverVersion) - 1));
0106 rsmiCheck(rsmi_shut_down());
0107
0108
0109
0110 int driverVersion = 0;
0111 hipCheck(hipDriverGetVersion(&driverVersion));
0112
0113
0114
0115 int runtimeVersion = 0;
0116 hipCheck(hipRuntimeGetVersion(&runtimeVersion));
0117
0118 edm::LogInfo log("ROCmService");
0119 if (verbose_) {
0120 log << "AMD kernel driver: " << systemDriverVersion << '\n';
0121 log << "ROCm driver API: " << decodeVersion(driverVersion) << " (compiled with ROCm " <<
0122 #ifdef ROCM_BUILD_INFO
0123
0124 ROCM_BUILD_INFO
0125 #else
0126 ROCM_VERSION_MAJOR << '.' << ROCM_VERSION_MINOR << '.' << ROCM_VERSION_PATCH
0127 #endif
0128 << ")\n";
0129 log << "ROCm runtime API: " << decodeVersion(runtimeVersion) << " (compiled with HIP " << HIP_VERSION_MAJOR << '.'
0130 << HIP_VERSION_MINOR << '.' << HIP_VERSION_PATCH << ")\n";
0131 log << "ROCm runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n";
0132 } else {
0133 log << "ROCm runtime version " << decodeVersion(runtimeVersion) << ", driver version "
0134 << decodeVersion(driverVersion) << ", AMD driver version " << systemDriverVersion;
0135 }
0136
0137 #if HIP_VERSION >= 50400000
0138 auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
0139 auto stackSize = limits.getUntrackedParameter<int>("hipLimitStackSize");
0140 auto mallocHeapSize = limits.getUntrackedParameter<int>("hipLimitMallocHeapSize");
0141 #endif
0142
0143 std::set<std::string> models;
0144
0145 for (int i = 0; i < numberOfDevices_; ++i) {
0146
0147
0148 hipDeviceProp_t properties;
0149 hipCheck(hipGetDeviceProperties(&properties, i));
0150 log << '\n' << "ROCm device " << i << ": " << properties.name;
0151 if (verbose_) {
0152 log << '\n';
0153 }
0154 models.insert(std::string(properties.name));
0155
0156
0157 computeCapabilities_.emplace_back(properties.major, properties.minor);
0158 if (verbose_) {
0159 log << " compute capability: " << properties.gcnArchName;
0160 } else {
0161 log << " (" << properties.gcnArchName << ")";
0162 }
0163 if (verbose_) {
0164 log << '\n';
0165 log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
0166 log << " ROCm cores: " << std::setw(28) << "not yet implemented" << '\n';
0167
0168 }
0169
0170
0171 static constexpr const char* computeModeDescription[] = {
0172 "default (shared)",
0173 "exclusive (single thread)",
0174 "prohibited",
0175 "exclusive (single process)",
0176 "unknown"};
0177 if (verbose_) {
0178 log << " compute mode:" << std::right << std::setw(27)
0179 << computeModeDescription[std::min(properties.computeMode,
0180 static_cast<int>(std::size(computeModeDescription)) - 1)]
0181 << '\n';
0182 }
0183
0184
0185 hipCheck(hipSetDevice(i));
0186 hipCheck(hipSetDeviceFlags(hipDeviceScheduleAuto | hipDeviceMapHost));
0187
0188 if (verbose_) {
0189
0190
0191 size_t freeMemory = 0;
0192 size_t totalMemory = 0;
0193 hipCheck(hipMemGetInfo(&freeMemory, &totalMemory));
0194 log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6)
0195 << totalMemory / (1 << 20) << " MB total\n";
0196 log << " constant memory: " << std::setw(8) << properties.totalConstMem / (1 << 10) << " kB\n";
0197 log << " L2 cache size: " << std::setw(8) << properties.l2CacheSize / (1 << 10) << " kB\n";
0198
0199 log << '\n';
0200
0201
0202 log << "Other capabilities\n";
0203 log << " " << (properties.canMapHostMemory ? "can" : "cannot")
0204 << " map host memory into the ROCm address space for use with hipHostAlloc()/hipHostGetDevicePointer()\n";
0205 log << " " << (properties.pageableMemoryAccess ? "supports" : "does not support")
0206 << " coherently accessing pageable memory without calling hipHostRegister() on it\n";
0207 log << " " << (properties.pageableMemoryAccessUsesHostPageTables ? "can" : "cannot")
0208 << " access pageable memory via the host's page tables\n";
0209 log << " " << (properties.managedMemory ? "supports" : "does not support")
0210 << " allocating managed memory on this system\n";
0211 log << " " << (properties.concurrentManagedAccess ? "can" : "cannot")
0212 << " coherently access managed memory concurrently with the host\n";
0213 log << " "
0214 << "the host " << (properties.directManagedMemAccessFromHost ? "can" : "cannot")
0215 << " directly access managed memory on the device without migration\n";
0216 log << " " << (properties.cooperativeLaunch ? "supports" : "does not support")
0217 << " launching cooperative kernels via hipLaunchCooperativeKernel()\n";
0218 log << " " << (properties.cooperativeMultiDeviceLaunch ? "supports" : "does not support")
0219 << " launching cooperative kernels via hipLaunchCooperativeKernelMultiDevice()\n";
0220 log << '\n';
0221 }
0222
0223
0224
0225 if (verbose_) {
0226 log << "ROCm flags\n";
0227 unsigned int flags;
0228 hipCheck(hipGetDeviceFlags(&flags));
0229 switch (flags & hipDeviceScheduleMask) {
0230 case hipDeviceScheduleAuto:
0231 log << " thread policy: default\n";
0232 break;
0233 case hipDeviceScheduleSpin:
0234 log << " thread policy: spin\n";
0235 break;
0236 case hipDeviceScheduleYield:
0237 log << " thread policy: yield\n";
0238 break;
0239 case hipDeviceScheduleBlockingSync:
0240 log << " thread policy: blocking sync\n";
0241 break;
0242 default:
0243 log << " thread policy: undefined\n";
0244 }
0245 if (flags & hipDeviceMapHost) {
0246 log << " pinned host memory allocations: enabled\n";
0247 } else {
0248 log << " pinned host memory allocations: disabled\n";
0249 }
0250 if (flags & hipDeviceLmemResizeToMax) {
0251 log << " kernel host memory reuse: enabled\n";
0252 } else {
0253 log << " kernel host memory reuse: disabled\n";
0254 }
0255 log << '\n';
0256 }
0257
0258
0259
0260
0261 #if HIP_VERSION >= 50400000
0262
0263 if (stackSize >= 0) {
0264 setHipLimit(hipLimitStackSize, "hipLimitStackSize", stackSize);
0265 }
0266
0267
0268 if (mallocHeapSize >= 0) {
0269 setHipLimit(hipLimitMallocHeapSize, "hipLimitMallocHeapSize", mallocHeapSize);
0270 }
0271 #endif
0272
0273 if (verbose_) {
0274 size_t value;
0275 log << "ROCm limits\n";
0276 #if HIP_VERSION >= 50400000
0277 hipCheck(hipDeviceGetLimit(&value, hipLimitStackSize));
0278 log << " stack size: " << std::setw(10) << value / (1 << 10) << " kB\n";
0279 #endif
0280 hipCheck(hipDeviceGetLimit(&value, hipLimitMallocHeapSize));
0281 log << " malloc heap size: " << std::setw(10) << value / (1 << 20) << " MB\n";
0282 }
0283 }
0284
0285 edm::Service<edm::ResourceInformation> resourceInformationService;
0286 if (resourceInformationService.isAvailable()) {
0287 std::vector<std::string> modelsV(models.begin(), models.end());
0288 resourceInformationService->setGPUModels(modelsV);
0289
0290
0291
0292
0293
0294
0295 }
0296
0297 if (verbose_) {
0298 log << '\n' << "ROCmService fully initialized";
0299 }
0300 enabled_ = true;
0301 }
0302
0303 ROCmService::~ROCmService() {
0304 if (enabled_) {
0305 for (int i = 0; i < numberOfDevices_; ++i) {
0306 hipCheck(hipSetDevice(i));
0307 hipCheck(hipDeviceSynchronize());
0308
0309
0310
0311 hipCheck(hipDeviceReset());
0312 }
0313 }
0314 }
0315
0316 void ROCmService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
0317 edm::ParameterSetDescription desc;
0318 desc.addUntracked<bool>("enabled", true);
0319 desc.addUntracked<bool>("verbose", false);
0320
0321 #if HIP_VERSION >= 50400000
0322 edm::ParameterSetDescription limits;
0323 limits.addUntracked<int>("hipLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
0324 limits.addUntracked<int>("hipLimitMallocHeapSize", -1)
0325 ->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
0326 desc.addUntracked<edm::ParameterSetDescription>("limits", limits)
0327 ->setComment(
0328 "See the documentation of hipDeviceSetLimit for more information.\nSetting any of these options to -1 keeps "
0329 "the default value.");
0330 #endif
0331
0332 descriptions.add("ROCmService", desc);
0333 }
0334
0335 namespace edm {
0336 namespace service {
0337 inline bool isProcessWideService(ROCmService const*) { return true; }
0338 }
0339 }
0340
// Service registration.
// ROCmServiceMaker names the maker type instantiated with the ROCmInterface base and the
// concrete ROCmService; it is passed, together with the service name, to the
// DEFINE_FWK_SERVICE_MAKER macro.
// NOTE(review): ParameterSetMaker presumably constructs the concrete service from its
// edm::ParameterSet - confirm against ServiceMaker.h.
0341 #include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
0342 using ROCmServiceMaker = edm::serviceregistry::ParameterSetMaker<ROCmInterface, ROCmService>;
0343 DEFINE_FWK_SERVICE_MAKER(ROCmService, ROCmServiceMaker);