AlpakaInterface/interface/CachedBufAlloc.h

0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
0003
0004 #include <alpaka/alpaka.hpp>
0005
0006 #include "HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h"
0007 #include "HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h"
0008
0009 namespace cms::alpakatools {
0010
0011   namespace traits {
0012
0013     //! The caching memory allocator trait.
0014     template <typename TElem,
0015               typename TDim,
0016               typename TIdx,
0017               typename TDev,
0018               typename TQueue,
0019               typename = void,
0020               typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
0021     struct CachedBufAlloc {
0022       static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
0023     };
0024
0025     //! The caching memory allocator implementation for the CPU device
0026     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0027     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
0028       template <typename TExtent>
0029       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0030                                                 TQueue queue,
0031                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0032         // non-cached, queue-ordered asynchronous host-only memory
0033         return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
0034       }
0035     };
0036
0037 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
0038
0039     //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
0040     template <typename TElem, typename TDim, typename TIdx>
0041     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtBlocking, void> {
0042       template <typename TExtent>
0043       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0044                                                 alpaka::QueueCudaRtBlocking queue,
0045                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0046         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0047
0048         auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtBlocking>();
0049
0050         // FIXME the BufCpu does not support a pitch ?
0051         size_t size = alpaka::getExtentProduct(extent);
0052         size_t sizeBytes = size * sizeof(TElem);
0053         void* memPtr = allocator.allocate(sizeBytes, queue);
0054
0055         // use a custom deleter to return the buffer to the CachingAllocator
0056         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0057
0058         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0059       }
0060     };
0061
0062     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
0063     template <typename TElem, typename TDim, typename TIdx>
0064     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
0065       template <typename TExtent>
0066       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0067                                                 alpaka::QueueCudaRtNonBlocking queue,
0068                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0069         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0070
0071         auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();
0072
0073         // FIXME the BufCpu does not support a pitch ?
0074         size_t size = alpaka::getExtentProduct(extent);
0075         size_t sizeBytes = size * sizeof(TElem);
0076         void* memPtr = allocator.allocate(sizeBytes, queue);
0077
0078         // use a custom deleter to return the buffer to the CachingAllocator
0079         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0080
0081         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0082       }
0083     };
0084
0085     //! The caching memory allocator implementation for the CUDA device
0086     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0087     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
0088       template <typename TExtent>
0089       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev,
0090                                                 TQueue queue,
0091                                                 TExtent const& extent) -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
0092         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0093
0094         auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);
0095
0096         size_t width = alpaka::getWidth(extent);
0097         size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
0098         // TODO implement pitch for TDim > 1
0099         size_t pitchBytes = widthBytes;
0100         size_t size = alpaka::getExtentProduct(extent);
0101         size_t sizeBytes = size * sizeof(TElem);
0102         void* memPtr = allocator.allocate(sizeBytes, queue);
0103
0104         // use a custom deleter to return the buffer to the CachingAllocator
0105         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0106
0107         return alpaka::BufCudaRt<TElem, TDim, TIdx>(
0108             dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0109       }
0110     };
0111
0112 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
0113
0114 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
0115
0116     //! The caching memory allocator implementation for the pinned host memory, with a blocking queue
0117     template <typename TElem, typename TDim, typename TIdx>
0118     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtBlocking, void> {
0119       template <typename TExtent>
0120       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0121                                                 alpaka::QueueHipRtBlocking queue,
0122                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0123         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0124
0125         auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtBlocking>();
0126
0127         // FIXME the BufCpu does not support a pitch ?
0128         size_t size = alpaka::getExtentProduct(extent);
0129         size_t sizeBytes = size * sizeof(TElem);
0130         void* memPtr = allocator.allocate(sizeBytes, queue);
0131
0132         // use a custom deleter to return the buffer to the CachingAllocator
0133         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0134
0135         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0136       }
0137     };
0138
0139     //! The caching memory allocator implementation for the pinned host memory, with a non-blocking queue
0140     template <typename TElem, typename TDim, typename TIdx>
0141     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
0142       template <typename TExtent>
0143       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
0144                                                 alpaka::QueueHipRtNonBlocking queue,
0145                                                 TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
0146         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0147
0148         auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();
0149
0150         // FIXME the BufCpu does not support a pitch ?
0151         size_t size = alpaka::getExtentProduct(extent);
0152         size_t sizeBytes = size * sizeof(TElem);
0153         void* memPtr = allocator.allocate(sizeBytes, queue);
0154
0155         // use a custom deleter to return the buffer to the CachingAllocator
0156         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0157
0158         return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
0159       }
0160     };
0161
0162     //! The caching memory allocator implementation for the ROCm/HIP device
0163     template <typename TElem, typename TDim, typename TIdx, typename TQueue>
0164     struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
0165       template <typename TExtent>
0166       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev,
0167                                                 TQueue queue,
0168                                                 TExtent const& extent) -> alpaka::BufHipRt<TElem, TDim, TIdx> {
0169         ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
0170
0171         auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);
0172
0173         size_t width = alpaka::getWidth(extent);
0174         size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
0175         // TODO implement pitch for TDim > 1
0176         size_t pitchBytes = widthBytes;
0177         size_t size = alpaka::getExtentProduct(extent);
0178         size_t sizeBytes = size * sizeof(TElem);
0179         void* memPtr = allocator.allocate(sizeBytes, queue);
0180
0181         // use a custom deleter to return the buffer to the CachingAllocator
0182         auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
0183
0184         return alpaka::BufHipRt<TElem, TDim, TIdx>(
0185             dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
0186       }
0187     };
0188
0189 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED
0190
0191   }  // namespace traits
0192
0193   template <typename TElem,
0194             typename TIdx,
0195             typename TExtent,
0196             typename TQueue,
0197             typename TDev,
0198             typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
0199   ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
0200     return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
0201   }
0202
0203 }  // namespace cms::alpakatools
0204
0205 #endif  // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h