Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-09-07 04:36:34

0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0003 
0004 #include <alpaka/alpaka.hpp>
0005 
0006 #include "HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h"
0007 #include "HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h"
0008 
0009 namespace cms::alpakatools {
0010 
0011   namespace traits {
0012 
0013     //! The caching memory allocator trait.
0014     template <typename TElem,
0015               typename TDim,
0016               typename TIdx,
0017               typename TDev,
0018               typename TQueue,
0019               typename = void,
0020               typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
0021     struct CachedBufAlloc {
0022       static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
0023     };
0024 
0025     //! The caching memory allocator implementation for the CPU device
0026     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0027     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
0028       template <typename TExtent>
0029       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0030                                                 TQueue queue,
0031                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0032         // non-cached, queue-ordered asynchronous host-only memory
0033         return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
0034       }
0035     };
0036 
0037 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
0038 
0039     //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
0040     template <typename TElem, typename TDim, typename TIdx>
0041     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtBlocking, void> {
0042       template <typename TExtent>
0043       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0044                                                 alpaka::QueueCudaRtBlocking queue,
0045                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0046         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0047 
0048         auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtBlocking>();
0049 
0050         // FIXME the BufCpu does not support a pitch ?
0051         size_t size = alpaka::getExtentProduct(extent);
0052         size_t sizeBytes = size * sizeof(TElem);
0053         void* memPtr = allocator.allocate(sizeBytes, queue);
0054 
0055         // use a custom deleter to return the buffer to the CachingAllocator
0056         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0057 
0058         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0059       }
0060     };
0061 
0062     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
0063     template <typename TElem, typename TDim, typename TIdx>
0064     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
0065       template <typename TExtent>
0066       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0067                                                 alpaka::QueueCudaRtNonBlocking queue,
0068                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0069         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0070 
0071         auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();
0072 
0073         // FIXME the BufCpu does not support a pitch ?
0074         size_t size = alpaka::getExtentProduct(extent);
0075         size_t sizeBytes = size * sizeof(TElem);
0076         void* memPtr = allocator.allocate(sizeBytes, queue);
0077 
0078         // use a custom deleter to return the buffer to the CachingAllocator
0079         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0080 
0081         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0082       }
0083     };
0084 
0085     //! The caching memory allocator implementation for the CUDA device
0086     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0087     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
0088       template <typename TExtent>
0089       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev,
0090                                                 TQueue queue,
0091                                                 TExtent const& extent) -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
0092         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0093 
0094         auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);
0095 
0096         size_t width = alpaka::getWidth(extent);
0097         size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
0098         // TODO implement pitch for TDim > 1
0099         size_t pitchBytes = widthBytes;
0100         size_t size = alpaka::getExtentProduct(extent);
0101         size_t sizeBytes = size * sizeof(TElem);
0102         void* memPtr = allocator.allocate(sizeBytes, queue);
0103 
0104         // use a custom deleter to return the buffer to the CachingAllocator
0105         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0106 
0107         return alpaka::BufCudaRt<TElem, TDim, TIdx>(
0108             dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0109       }
0110     };
0111 
0112 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
0113 
0114 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
0115 
0116     //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
0117     template <typename TElem, typename TDim, typename TIdx>
0118     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtBlocking, void> {
0119       template <typename TExtent>
0120       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0121                                                 alpaka::QueueHipRtBlocking queue,
0122                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0123         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0124 
0125         auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtBlocking>();
0126 
0127         // FIXME the BufCpu does not support a pitch ?
0128         size_t size = alpaka::getExtentProduct(extent);
0129         size_t sizeBytes = size * sizeof(TElem);
0130         void* memPtr = allocator.allocate(sizeBytes, queue);
0131 
0132         // use a custom deleter to return the buffer to the CachingAllocator
0133         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0134 
0135         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0136       }
0137     };
0138 
0139     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
0140     template <typename TElem, typename TDim, typename TIdx>
0141     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
0142       template <typename TExtent>
0143       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0144                                                 alpaka::QueueHipRtNonBlocking queue,
0145                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0146         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0147 
0148         auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();
0149 
0150         // FIXME the BufCpu does not support a pitch ?
0151         size_t size = alpaka::getExtentProduct(extent);
0152         size_t sizeBytes = size * sizeof(TElem);
0153         void* memPtr = allocator.allocate(sizeBytes, queue);
0154 
0155         // use a custom deleter to return the buffer to the CachingAllocator
0156         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0157 
0158         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0159       }
0160     };
0161 
0162     //! The caching memory allocator implementation for the ROCm/HIP device
0163     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0164     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
0165       template <typename TExtent>
0166       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev,
0167                                                 TQueue queue,
0168                                                 TExtent const& extent) -> alpaka::BufHipRt<TElem, TDim, TIdx> {
0169         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0170 
0171         auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);
0172 
0173         size_t width = alpaka::getWidth(extent);
0174         size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
0175         // TODO implement pitch for TDim > 1
0176         size_t pitchBytes = widthBytes;
0177         size_t size = alpaka::getExtentProduct(extent);
0178         size_t sizeBytes = size * sizeof(TElem);
0179         void* memPtr = allocator.allocate(sizeBytes, queue);
0180 
0181         // use a custom deleter to return the buffer to the CachingAllocator
0182         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0183 
0184         return alpaka::BufHipRt<TElem, TDim, TIdx>(
0185             dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0186       }
0187     };
0188 
0189 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED
0190 
0191   }  // namespace traits
0192 
0193   template <typename TElem,
0194             typename TIdx,
0195             typename TExtent,
0196             typename TQueue,
0197             typename TDev,
0198             typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
0199   ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
0200     return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
0201   }
0202 
0203 }  // namespace cms::alpakatools
0204 
0205 #endif  // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h