Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2025-03-26 01:51:19

0001 import FWCore.ParameterSet.Config as cms
0002 
0003 import os
0004 
0005 from HeterogeneousCore.Common.PlatformStatus import PlatformStatus
0006 
class ModuleTypeResolverAlpaka:
    """Resolve the concrete backend of '@alpaka' modules.

    Maps the set of accelerators available to the job onto the list of
    usable Alpaka backends, and fills in the (untracked) 'alpaka' PSet
    of every module whose type ends with '@alpaka'.
    """
    def __init__(self, accelerators, backend, synchronize):
        """
        accelerators: iterable of accelerator labels the job may use
                      ("cpu", "gpu-nvidia", "gpu-amd")
        backend:      explicitly requested backend name, or None to keep
                      the default preference order
        synchronize:  default value for the per-module 'synchronize' flag

        Raises cms.EDMException (UnavailableAccelerator) if no backend is
        available, or if the explicitly requested backend is not available.
        """
        # The first element is used as the default if a module does not set
        # a backend explicitly. Preference order: NVIDIA GPU, AMD GPU, CPU.
        self._valid_backends = []
        if "gpu-nvidia" in accelerators:
            self._valid_backends.append("cuda_async")
        if "gpu-amd" in accelerators:
            self._valid_backends.append("rocm_async")
        if "cpu" in accelerators:
            self._valid_backends.append("serial_sync")
        if len(self._valid_backends) == 0:
            raise cms.EDMException(cms.edm.errors.UnavailableAccelerator, "ModuleTypeResolverAlpaka had no backends available because of the combination of the job configuration and accelerator availability on the machine. The job sees {} accelerators.".format(", ".join(accelerators)))
        if backend is not None:
            if backend not in self._valid_backends:
                raise cms.EDMException(cms.edm.errors.UnavailableAccelerator, "The ProcessAcceleratorAlpaka was configured to use {} backend, but that backend is not available because of the combination of the job configuration and accelerator availability on the machine. The job was configured to use {} accelerators, which translates to {} Alpaka backends.".format(
                    backend, ", ".join(accelerators), ", ".join(self._valid_backends)))
            if backend != self._valid_backends[0]:
                # Move the explicitly requested backend to the front so that
                # it becomes the default.
                self._valid_backends.remove(backend)
                self._valid_backends.insert(0, backend)
        self._synchronize = synchronize

    def plugin(self):
        """Name of the C++ ModuleTypeResolver plugin this class configures."""
        return "ModuleTypeResolverAlpaka"

    def setModuleVariant(self, module):
        """Fill in the 'alpaka' PSet (backend, synchronize) of 'module' if
        its type ends with '@alpaka', validating any explicitly-set backend.

        Raises cms.EDMException if the module's 'alpaka' PSet is tracked, or
        if the module requests a backend unavailable to this job.
        """
        if module.type_().endswith("@alpaka"):
            defaultBackend = self._valid_backends[0]
            if hasattr(module, "alpaka"):
                # Ensure the untrackedness already here, because the
                # C++ ModuleTypeResolverAlpaka relies on the
                # untrackedness (before the configuration validation)
                if module.alpaka.isTracked():
                    raise cms.EDMException(cms.edm.errors.Configuration, "The 'alpaka' PSet in module '{}' is tracked, but it should be untracked".format(module.label()))
                if hasattr(module.alpaka, "backend"):
                    if module.alpaka.backend == "":
                        module.alpaka.backend = defaultBackend
                    elif module.alpaka.backend.value() not in self._valid_backends:
                        raise cms.EDMException(cms.edm.errors.UnavailableAccelerator, "Module {} has the Alpaka backend set explicitly, but its accelerator is not available for the job because of the combination of the job configuration and accelerator availability on the machine. The following Alpaka backends are available for the job {}.".format(module.label_(), ", ".join(self._valid_backends)))
                else:
                    module.alpaka.backend = cms.untracked.string(defaultBackend)
            else:
                module.alpaka = cms.untracked.PSet(
                    backend = cms.untracked.string(defaultBackend)
                )
            # An untracked, optional cms.bool that is still "unset" marks the
            # default value; in that case apply the global default.
            isDefaultValue = lambda v: \
                isinstance(v, type(cms.optional.untracked.bool)) \
                and not v.isTracked() \
                and v.isCompatibleCMSType(cms.bool)
            if not hasattr(module.alpaka, "synchronize") or isDefaultValue(module.alpaka.synchronize):
                module.alpaka.synchronize = cms.untracked.bool(self._synchronize)
0057 
class ProcessAcceleratorAlpaka(cms.ProcessAccelerator):
    """ProcessAcceleratorAlpaka itself does not define or inspect
    availability of any accelerator devices. It merely sets up
    necessary Alpaka infrastructure based on the availability of
    accelerators that the concrete ProcessAccelerators (like
    ProcessAcceleratorCUDA) define.
    """
    def __init__(self):
        super(ProcessAcceleratorAlpaka, self).__init__()
        # Explicit backend choice; None means "use the default order".
        self._backend = None
        # Default for the per-module 'synchronize' flag.
        self._synchronize = False

    # User-facing interface
    def setBackend(self, backend):
        """Force all Alpaka modules to use the backend named 'backend'."""
        self._backend = backend

    def setSynchronize(self, synchronize):
        """Set the default value of the per-module 'synchronize' flag."""
        self._synchronize = synchronize

    # Framework-facing interface
    def moduleTypeResolver(self, accelerators):
        """Return the resolver that picks a backend for each '@alpaka' module."""
        return ModuleTypeResolverAlpaka(accelerators, self._backend, self._synchronize)

    @staticmethod
    def _syncService(process, serviceName, service):
        """Ensure 'process' contains 'service' (when it is not None), or
        contains no service attribute named 'serviceName' (when it is None)."""
        if service is not None:
            if not hasattr(process, serviceName):
                process.add_(service)
        elif hasattr(process, serviceName):
            delattr(process, serviceName)

    def apply(self, process, accelerators):
        """Add or remove the Alpaka services in 'process' according to the
        'accelerators' the job may use."""
        # Propagate the AlpakaService messages through the MessageLogger
        if not hasattr(process.MessageLogger, "AlpakaService"):
            process.MessageLogger.AlpakaService = cms.untracked.PSet()

        # The CPU backend is effectively always available, ensure the AlpakaServiceSerialSync is loaded
        if not hasattr(process, "AlpakaServiceSerialSync"):
            from HeterogeneousCore.AlpakaServices.AlpakaServiceSerialSync_cfi import AlpakaServiceSerialSync
            process.add_(AlpakaServiceSerialSync)

        # Load the CUDA service only if the job may use an NVIDIA GPU and the
        # CUDA-enabled service is importable in this release; remove it
        # otherwise. (This replaces an earlier 'raise False' + bare 'except:'
        # construct, which worked only through the TypeError produced by
        # raising a non-exception, and silently swallowed unrelated errors.)
        AlpakaServiceCudaAsync = None
        if "gpu-nvidia" in accelerators:
            try:
                from HeterogeneousCore.AlpakaServices.AlpakaServiceCudaAsync_cfi import AlpakaServiceCudaAsync
            except ImportError:
                pass
        self._syncService(process, "AlpakaServiceCudaAsync", AlpakaServiceCudaAsync)

        # Same logic for ROCm / AMD GPUs.
        AlpakaServiceROCmAsync = None
        if "gpu-amd" in accelerators:
            try:
                from HeterogeneousCore.AlpakaServices.AlpakaServiceROCmAsync_cfi import AlpakaServiceROCmAsync
            except ImportError:
                pass
        self._syncService(process, "AlpakaServiceROCmAsync", AlpakaServiceROCmAsync)
0118 
0119 
# Ensure this module is kept in the configuration when dumping it:
# register the import statement that a dumped configuration must include so
# that a ProcessAcceleratorAlpaka instance can be re-created from the dump.
cms.specialImportRegistry.registerSpecialImportForType(ProcessAcceleratorAlpaka, "from HeterogeneousCore.AlpakaCore.ProcessAcceleratorAlpaka import ProcessAcceleratorAlpaka")