Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-20 02:31:58

0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0003 
0004 #include <alpaka/alpaka.hpp>
0005 
0006 #include "HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h"
0007 #include "HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h"
0008 
0009 namespace cms::alpakatools {
0010 
0011   namespace traits {
0012 
0013     //! The caching memory allocator trait.
0014     //!
0015     //! Primary template, used only when no (device, queue) specialisation matches: instantiating it trips the
0016     //! static_assert below. The last, defaulted parameter requires TDev/TQueue to be an alpaka device/queue.
0017     template <typename TElem, typename TDim, typename TIdx, typename TDev, typename TQueue,
0018               typename = void,
0019               typename = std::enable_if_t<alpaka::isDevice<TDev> && alpaka::isQueue<TQueue>>>
0020     struct CachedBufAlloc {
0021       // the asserted value is spelled in terms of TDev, so it is a template-dependent expression
0022       static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
0023     };
0024 
0025     //! CPU device specialisation (any queue type): the buffer is not cached, it comes from alpaka::allocAsyncBuf.
0026     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0027     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
0028       //! `dev` is unused: it is accepted only so that every specialisation is called with the same arguments.
0029       template <typename TExtent>
0030       ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(
0031           alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent) {
0032         return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);  // non-cached, queue-ordered, host-only
0033       }
0034     };
0035 
0036 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
0037 
0038     //! The caching memory allocator implementation for the pinned host memory, with a blocking CUDA queue:
0039     //! storage comes from the host caching allocator for that queue type and goes back to it on release.
0040     template <typename TElem, typename TDim, typename TIdx>
0041     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtBlocking, void> {
0042       //! Returns a BufCpu on `dev` covering `extent`; the backing block of getExtentProduct(extent) elements
0043       //! is requested from the allocator together with `queue`.
0044       template <typename TExtent>
0045       ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(
0046           alpaka::DevCpu const& dev, alpaka::QueueCudaRtBlocking queue, TExtent const& extent) {
0047         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0048 
0049         auto& cache = getHostCachingAllocator<alpaka::QueueCudaRtBlocking>();
0050 
0051         // FIXME the BufCpu does not support a pitch ?
0052         size_t const elements = alpaka::getExtentProduct(extent);
0053         void* const block = cache.allocate(elements * sizeof(TElem), queue);
0054 
0055         // the deleter passed to the buffer hands the memory back to the caching allocator
0056         auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };
0057         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, static_cast<TElem*>(block), std::move(giveBack), extent);
0058       }
0059     };
0060 
0061     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking CUDA queue:
0062     //! storage comes from the host caching allocator for that queue type and goes back to it on release.
0063     template <typename TElem, typename TDim, typename TIdx>
0064     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
0065       //! Returns a BufCpu on `dev` covering `extent`; the backing block of getExtentProduct(extent) elements
0066       //! is requested from the allocator together with `queue`.
0067       template <typename TExtent>
0068       ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(
0069           alpaka::DevCpu const& dev, alpaka::QueueCudaRtNonBlocking queue, TExtent const& extent) {
0070         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0071 
0072         auto& cache = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();
0073 
0074         // FIXME the BufCpu does not support a pitch ?
0075         size_t const elements = alpaka::getExtentProduct(extent);
0076         void* const block = cache.allocate(elements * sizeof(TElem), queue);
0077 
0078         // the deleter passed to the buffer hands the memory back to the caching allocator
0079         auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };
0080         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, static_cast<TElem*>(block), std::move(giveBack), extent);
0081       }
0082     };
0083 
0084     //! The caching memory allocator implementation for the CUDA device: the storage is requested from the
0085     //! caching allocator obtained for `dev` and TQueue, and the buffer's deleter hands it back to that allocator.
0086     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0087     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
0088       //! Returns a BufCudaRt on `dev` covering `extent`; no row padding is applied (pitch == width in bytes).
0089       template <typename TExtent>
0090       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent)
0091           -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
0092         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0093 
0094         auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);
0095 
0096         // byte counts are computed in size_t: casting sizeof(TElem) to TIdx first could truncate it
0097         size_t width = alpaka::getWidth(extent);
0098         // TODO implement pitch for TDim > 1
0099         size_t pitchBytes = width * sizeof(TElem);
0100         size_t size = alpaka::getExtentProduct(extent);
0101         void* memPtr = allocator.allocate(size * sizeof(TElem), queue);
0102 
0103         // use a custom deleter to return the buffer to the CachingAllocator
0104         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0105         return alpaka::BufCudaRt<TElem, TDim, TIdx>(
0106             dev, static_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0107       }
0108     };
0109 
0110 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
0111 
0112 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
0113 
0114     //! The caching memory allocator implementation for the pinned host memory, with a blocking HIP queue:
0115     //! storage comes from the host caching allocator for that queue type and goes back to it on release.
0116     template <typename TElem, typename TDim, typename TIdx>
0117     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtBlocking, void> {
0118       //! Returns a BufCpu on `dev` covering `extent`; the backing block of getExtentProduct(extent) elements
0119       //! is requested from the allocator together with `queue`.
0120       template <typename TExtent>
0121       ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(
0122           alpaka::DevCpu const& dev, alpaka::QueueHipRtBlocking queue, TExtent const& extent) {
0123         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0124 
0125         auto& cache = getHostCachingAllocator<alpaka::QueueHipRtBlocking>();
0126 
0127         // FIXME the BufCpu does not support a pitch ?
0128         size_t const elements = alpaka::getExtentProduct(extent);
0129         void* const block = cache.allocate(elements * sizeof(TElem), queue);
0130 
0131         // the deleter passed to the buffer hands the memory back to the caching allocator
0132         auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };
0133         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, static_cast<TElem*>(block), std::move(giveBack), extent);
0134       }
0135     };
0136 
0137     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking HIP queue:
0138     //! storage comes from the host caching allocator for that queue type and goes back to it on release.
0139     template <typename TElem, typename TDim, typename TIdx>
0140     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
0141       //! Returns a BufCpu on `dev` covering `extent`; the backing block of getExtentProduct(extent) elements
0142       //! is requested from the allocator together with `queue`.
0143       template <typename TExtent>
0144       ALPAKA_FN_HOST static alpaka::BufCpu<TElem, TDim, TIdx> allocCachedBuf(
0145           alpaka::DevCpu const& dev, alpaka::QueueHipRtNonBlocking queue, TExtent const& extent) {
0146         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0147 
0148         auto& cache = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();
0149 
0150         // FIXME the BufCpu does not support a pitch ?
0151         size_t const elements = alpaka::getExtentProduct(extent);
0152         void* const block = cache.allocate(elements * sizeof(TElem), queue);
0153 
0154         // the deleter passed to the buffer hands the memory back to the caching allocator
0155         auto giveBack = [pool = &cache](TElem* ptr) { pool->free(ptr); };
0156         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, static_cast<TElem*>(block), std::move(giveBack), extent);
0157       }
0158     };
0159 
0160     //! The caching memory allocator implementation for the ROCm/HIP device: the storage is requested from the
0161     //! caching allocator obtained for `dev` and TQueue, and the buffer's deleter hands it back to that allocator.
0162     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0163     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
0164       //! Returns a BufHipRt on `dev` covering `extent`; no row padding is applied (pitch == width in bytes).
0165       template <typename TExtent>
0166       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
0167           -> alpaka::BufHipRt<TElem, TDim, TIdx> {
0168         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0169 
0170         auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);
0171 
0172         // byte counts are computed in size_t: casting sizeof(TElem) to TIdx first could truncate it
0173         size_t width = alpaka::getWidth(extent);
0174         // TODO implement pitch for TDim > 1
0175         size_t pitchBytes = width * sizeof(TElem);
0176         size_t size = alpaka::getExtentProduct(extent);
0177         void* memPtr = allocator.allocate(size * sizeof(TElem), queue);
0178 
0179         // use a custom deleter to return the buffer to the CachingAllocator
0180         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0181         return alpaka::BufHipRt<TElem, TDim, TIdx>(
0182             dev, static_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0183       }
0184     };
0185 
0186 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED
0187 
0188   }  // namespace traits
0189 
0190   //! Allocates a buffer of TElem covering `extent` for `dev`, dispatching on the device and queue types to the
0191   //! matching traits::CachedBufAlloc specialisation; the buffer dimensionality is alpaka::Dim<TExtent>.
0192   //! `extent` defaults to TExtent(). The overload is only viable for alpaka devices and alpaka queues.
0193   template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev,
0194             typename = std::enable_if_t<alpaka::isDevice<TDev> && alpaka::isQueue<TQueue>>>
0195   ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
0196     using Allocator = traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>;
0197     return Allocator::allocCachedBuf(dev, queue, extent);
0198   }
0199 
0200 }  // namespace cms::alpakatools
0201 
0202 #endif  // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h