#ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
#define HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h

#include <cassert>
#include <exception>
#include <iomanip>
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
#include "HeterogeneousCore/AlpakaInterface/interface/AlpakaServiceFwd.h"

// Inspired by cub::CachingDeviceAllocator

namespace cms::alpakatools {

  namespace detail {

    // integer exponentiation by squaring: returns base^exponent
    inline constexpr unsigned int power(unsigned int base, unsigned int exponent) {
      unsigned int power = 1;
      while (exponent > 0) {
        if (exponent & 1) {
          power = power * base;
        }
        base = base * base;
        exponent = exponent >> 1;
      }
      return power;
    }

    // format a memory size in B/kB/MB/GB
    inline std::string as_bytes(size_t value) {
      if (value == std::numeric_limits<size_t>::max()) {
        return "unlimited";
      } else if (value >= (1 << 30) and value % (1 << 30) == 0) {
        return std::to_string(value >> 30) + " GB";
      } else if (value >= (1 << 20) and value % (1 << 20) == 0) {
        return std::to_string(value >> 20) + " MB";
      } else if (value >= (1 << 10) and value % (1 << 10) == 0) {
        return std::to_string(value >> 10) + " kB";
      } else {
        return std::to_string(value) + "  B";
      }
    }

  }  // namespace detail
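
  // For illustration: detail::power(2, 20) == 1048576, and detail::as_bytes(1048576) returns "1 MB";
  // a value such as 1000, which is not an exact multiple of 1 kB, falls through to "1000  B".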

  /*
   * The "memory device" identifies the memory space, i.e. the device where the memory is allocated.
   * A caching allocator object is associated to a single memory `Device`, set at construction time, and unchanged for
   * the lifetime of the allocator.
   *
   * Each allocation is associated to an event on a queue, which identifies the "synchronisation device" according to
   * which the synchronisation occurs.
   * The `Event` type depends only on the synchronisation `Device` type.
   * The `Queue` type depends on the synchronisation `Device` type and the queue properties, either `Sync` or `Async`.
   *
   * **Note**: how should different queue and event types be handled in a single allocator? Store and access
   * type-punned queues and events, or template the internal structures on them with a common base class?
   * alpaka relies on the compile-time type for dispatch.
   *
   * Common use case #1: accelerator's memory allocations
   *   - the "memory device" is the accelerator device (e.g. a GPU);
   *   - the "synchronisation device" is the same accelerator device;
   *   - the `Queue` type is usually the same (either `Sync` or `Async`).
   *
   * Common use case #2: pinned host memory allocations
   *   - the "memory device" is the host device (e.g. system memory);
   *   - the "synchronisation device" is the accelerator device (e.g. a GPU) whose work queue will access the host
   *     memory (direct memory access from the accelerator, or scheduling `alpaka::memcpy`/`alpaka::memset`), and can
   *     be different for each allocation;
   *   - the synchronisation `Device` _type_ could potentially be different, but memory pinning is currently tied to
   *     the accelerator's platform (CUDA, HIP, etc.), so the device type needs to be fixed to benefit from caching;
   *   - the `Queue` type can be either `Sync` _or_ `Async` on any allocation.
   */
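
  /*
   * A minimal usage sketch (illustrative only: the concrete parameter values and the `device`/`queue` objects are
   * assumptions; in CMSSW they are provided by the AlpakaService and the framework). With binGrowth = 2, minBin = 16
   * and maxBin = 30 the allocator manages bins from 64 kB up to 1 GB:
   *
   *   CachingAllocator<Device, Queue> allocator(device,
   *                                             2,       // binGrowth
   *                                             16,      // minBin: 2^16 bytes = 64 kB
   *                                             30,      // maxBin: 2^30 bytes = 1 GB
   *                                             0,       // maxCachedBytes: no explicit limit
   *                                             0.8,     // maxCachedFraction: cache at most 80% of the device memory
   *                                             true,    // reuseSameQueueAllocations
   *                                             false);  // debug
   *
   *   void* ptr = allocator.allocate(100000, queue);  // served from the 128 kB bin
   *   // ... use the memory in work submitted to `queue` ...
   *   allocator.free(ptr);  // the buffer is cached for reuse, or freed if the cache is full
   */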

  template <typename TDev, typename TQueue>
  class CachingAllocator {
  public:
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
    friend class alpaka_cuda_async::AlpakaService;
#endif
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
    friend class alpaka_rocm_async::AlpakaService;
#endif
#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
    friend class alpaka_serial_sync::AlpakaService;
#endif
#ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
    friend class alpaka_tbb_async::AlpakaService;
#endif

    using Device = TDev;                 // the "memory device", where the memory will be allocated
    using Queue = TQueue;                // the queue used to submit the memory operations
    using Event = alpaka::Event<Queue>;  // the events used to synchronise the operations
    using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;

    // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
    static_assert(alpaka::isDevice<Device>, "TDev should be an alpaka Device type.");
    static_assert(alpaka::isQueue<Queue>, "TQueue should be an alpaka Queue type.");
    static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
                  "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
                  "host CPU.");

    struct CachedBytes {
      size_t free = 0;       // total bytes freed and cached on this device
      size_t live = 0;       // total bytes currently in use on this device
      size_t requested = 0;  // total bytes requested and currently in use on this device
    };

    explicit CachingAllocator(
        Device const& device,
        unsigned int binGrowth,          // bin growth factor;
        unsigned int minBin,             // smallest bin, corresponds to binGrowth^minBin bytes;
                                         // smaller allocations are rounded to this value;
        unsigned int maxBin,             // largest bin, corresponds to binGrowth^maxBin bytes;
                                         // larger allocations will fail;
        size_t maxCachedBytes,           // total storage for the allocator (0 means no limit);
        double maxCachedFraction,        // fraction of total device memory taken for the allocator (0 means no limit);
                                         // if both maxCachedBytes and maxCachedFraction are non-zero,
                                         // the smallest resulting value is used.
        bool reuseSameQueueAllocations,  // reuse non-ready allocations if they are in the same queue as the new one;
                                         // this is safe only if all memory operations are scheduled in the same queue
        bool debug)
        : device_(device),
          binGrowth_(binGrowth),
          minBin_(minBin),
          maxBin_(maxBin),
          minBinBytes_(detail::power(binGrowth, minBin)),
          maxBinBytes_(detail::power(binGrowth, maxBin)),
          maxCachedBytes_(cacheSize(maxCachedBytes, maxCachedFraction)),
          reuseSameQueueAllocations_(reuseSameQueueAllocations),
          debug_(debug) {
      if (debug_) {
        std::ostringstream out;
        out << "CachingAllocator settings\n"
            << "  bin growth " << binGrowth_ << "\n"
            << "  min bin    " << minBin_ << "\n"
            << "  max bin    " << maxBin_ << "\n"
            << "  resulting bins:\n";
        for (auto bin = minBin_; bin <= maxBin_; ++bin) {
          auto binSize = detail::power(binGrowth, bin);
          out << "    " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n';
        }
        out << "  maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_);
        std::cout << out.str() << std::endl;
      }
    }

    ~CachingAllocator() {
      {
        // this should never be called while some memory blocks are still live
        std::scoped_lock lock(mutex_);
        assert(liveBlocks_.empty());
        assert(cachedBytes_.live == 0);
      }

      freeAllCached();
    }

    // return a copy of the cache allocation status, for monitoring purposes
    CachedBytes cacheStatus() const {
      std::scoped_lock lock(mutex_);
      return cachedBytes_;
    }

    // allocate the given number of bytes on this allocator's device, associated to the given queue
    void* allocate(size_t bytes, Queue queue) {
      // create a block descriptor for the requested allocation
      BlockDescriptor block;
      block.queue = std::move(queue);
      block.requested = bytes;
      std::tie(block.bin, block.bytes) = findBin(bytes);

      // try to re-use a cached block, or allocate a new buffer
      if (not tryReuseCachedBlock(block)) {
        allocateNewBlock(block);
      }

      return block.buffer->data();
    }

    // frees an allocation
    void free(void* ptr) {
      std::scoped_lock lock(mutex_);

      auto iBlock = liveBlocks_.find(ptr);
      if (iBlock == liveBlocks_.end()) {
        std::stringstream ss;
        ss << "Trying to free a non-live block at " << ptr;
        throw std::runtime_error(ss.str());
      }
      // remove the block from the list of live blocks
      BlockDescriptor block = std::move(iBlock->second);
      liveBlocks_.erase(iBlock);
      cachedBytes_.live -= block.bytes;
      cachedBytes_.requested -= block.requested;

      bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_);
      if (recache) {
        // record the block's event in its queue: the block becomes reusable once all the work
        // submitted to the queue up to this point has completed
        alpaka::enqueue(*(block.queue), *(block.event));
        cachedBytes_.free += block.bytes;
        // after the call to insert(), cachedBlocks_ shares ownership of the buffer
        // TODO use std::move ?
        cachedBlocks_.insert(std::make_pair(block.bin, block));

        if (debug_) {
          std::ostringstream out;
          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at "
              << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event "
              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
              << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
              << " bytes) outstanding." << std::endl;
          std::cout << out.str() << std::endl;
        }
      } else {
        // if the buffer is not recached, it is automatically freed when block goes out of scope
        if (debug_) {
          std::ostringstream out;
          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
              << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
              << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
              << " bytes) outstanding." << std::endl;
          std::cout << out.str() << std::endl;
        }
      }
    }

  private:
    struct BlockDescriptor {
      std::optional<Buffer> buffer;
      std::optional<Queue> queue;
      std::optional<Event> event;
      size_t bytes = 0;
      size_t requested = 0;  // for monitoring only
      unsigned int bin = 0;

      // the "synchronisation device" for this block
      auto device() { return alpaka::getDev(*queue); }
    };

  private:
    // return the maximum amount of memory that should be cached on this device
    size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const {
      // note that getMemBytes() returns 0 if the platform does not support querying the device memory
      size_t totalMemory = alpaka::getMemBytes(device_);
      size_t memoryFraction = static_cast<size_t>(maxCachedFraction * totalMemory);
      size_t size = std::numeric_limits<size_t>::max();
      if (maxCachedBytes > 0 and maxCachedBytes < size) {
        size = maxCachedBytes;
      }
      if (memoryFraction > 0 and memoryFraction < size) {
        size = memoryFraction;
      }
      return size;
    }
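
    // For illustration: with maxCachedBytes == 0 and maxCachedFraction == 0.5 on a device reporting 16 GB of
    // memory, cacheSize() returns 8 GB; if both limits are non-zero the smaller one wins, and if both are zero
    // the cache size is effectively unlimited.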

    // return (bin, bin size)
    std::tuple<unsigned int, size_t> findBin(size_t bytes) const {
      if (bytes < minBinBytes_) {
        return std::make_tuple(minBin_, minBinBytes_);
      }
      if (bytes > maxBinBytes_) {
        throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
                                 " bytes is too large for the caching allocator with maximum bin " +
                                 std::to_string(maxBinBytes_) +
                                 " bytes. You might want to increase the maximum bin size");
      }
      unsigned int bin = minBin_;
      size_t binBytes = minBinBytes_;
      while (binBytes < bytes) {
        ++bin;
        binBytes *= binGrowth_;
      }
      return std::make_tuple(bin, binBytes);
    }
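
    // For illustration: assuming binGrowth_ == 2 and minBin_ == 16 (so the smallest bin is 64 kB), findBin(1000)
    // returns the 64 kB minimum bin, while findBin(100000) returns bin 17, i.e. a 131072-byte (128 kB) buffer.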

    bool tryReuseCachedBlock(BlockDescriptor& block) {
      std::scoped_lock lock(mutex_);

      // iterate through the range of cached blocks in the same bin
      const auto [begin, end] = cachedBlocks_.equal_range(block.bin);
      for (auto iBlock = begin; iBlock != end; ++iBlock) {
        if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or
            alpaka::isComplete(*(iBlock->second.event))) {
          // associate the cached buffer to the new queue
          auto queue = std::move(*(block.queue));
          // TODO cache (or remove) the debug information and use std::move()
          block = iBlock->second;
          block.queue = std::move(queue);

          // if the new queue is on a different device than the old event, create a new event
          if (block.device() != alpaka::getDev(*(block.event))) {
            block.event = Event{block.device()};
          }

          // insert the cached block into the live blocks
          // TODO cache (or remove) the debug information and use std::move()
          liveBlocks_[block.buffer->data()] = block;

          // update the accounting information
          cachedBytes_.free -= block.bytes;
          cachedBytes_.live += block.bytes;
          cachedBytes_.requested += block.requested;

          if (debug_) {
            std::ostringstream out;
            out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at "
                << block.buffer->data() << " (" << block.bytes << " bytes) for queue "
                << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get()
                << " (previously associated with queue " << iBlock->second.queue->m_spQueueImpl.get() << " , event "
                << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
            std::cout << out.str() << std::endl;
          }

          // remove the reused block from the list of cached blocks
          cachedBlocks_.erase(iBlock);
          return true;
        }
      }

      return false;
    }

    Buffer allocateBuffer(size_t bytes, Queue const& queue) {
      if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
        // allocate device memory
        return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
      } else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
        // allocate pinned host memory accessible by the queue's platform
        return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<Queue>>, std::byte, size_t>(device_, bytes);
      } else {
        // unsupported combination
        static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
                      "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
                      "the host CPU.");
      }
    }

    void allocateNewBlock(BlockDescriptor& block) {
      try {
        block.buffer = allocateBuffer(block.bytes, *block.queue);
      } catch (std::runtime_error const& e) {
        // the allocation attempt failed: free all cached blocks on the device and retry
        if (debug_) {
          std::ostringstream out;
          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes
              << " bytes for queue " << block.queue->m_spQueueImpl.get()
              << ", retrying after freeing cached allocations" << std::endl;
          std::cout << out.str() << std::endl;
        }
        // TODO implement a method that frees only up to block.bytes bytes
        freeAllCached();

        // throw an exception if it fails again
        block.buffer = allocateBuffer(block.bytes, *block.queue);
      }

      // create a new event associated to the "synchronisation device"
      block.event = Event{block.device()};

      {
        std::scoped_lock lock(mutex_);
        cachedBytes_.live += block.bytes;
        cachedBytes_.requested += block.requested;
        // TODO use std::move() ?
        liveBlocks_[block.buffer->data()] = block;
      }

      if (debug_) {
        std::ostringstream out;
        out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at "
            << block.buffer->data() << " (" << block.bytes << " bytes) associated with queue "
            << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl;
        std::cout << out.str() << std::endl;
      }
    }

    void freeAllCached() {
      std::scoped_lock lock(mutex_);

      while (not cachedBlocks_.empty()) {
        auto iBlock = cachedBlocks_.begin();
        cachedBytes_.free -= iBlock->second.bytes;

        if (debug_) {
          std::ostringstream out;
          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes
              << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free
              << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding."
              << std::endl;
          std::cout << out.str() << std::endl;
        }

        cachedBlocks_.erase(iBlock);
      }
    }

    // TODO replace with a tbb::concurrent_multimap ?
    using CachedBlocks = std::multimap<unsigned int, BlockDescriptor>;  // ordered by the allocation bin
    // TODO replace with a tbb::concurrent_map ?
    using BusyBlocks = std::map<void*, BlockDescriptor>;  // ordered by the address of the allocated memory

    inline static const std::string deviceType_ = alpaka::core::demangled<Device>;

    mutable std::mutex mutex_;
    Device device_;  // the device where the memory is allocated

    CachedBytes cachedBytes_;
    CachedBlocks cachedBlocks_;  // set of cached device allocations available for reuse
    BusyBlocks liveBlocks_;      // map of pointers to the live device allocations currently in use

    const unsigned int binGrowth_;  // geometric growth factor for bin sizes
    const unsigned int minBin_;
    const unsigned int maxBin_;

    const size_t minBinBytes_;
    const size_t maxBinBytes_;
    const size_t maxCachedBytes_;  // maximum aggregate cached bytes per device

    const bool reuseSameQueueAllocations_;
    const bool debug_;
  };

}  // namespace cms::alpakatools

#endif  // HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h