File indexing completed on 2024-07-10 02:34:46
0001 #ifndef HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
0002 #define HeterogeneousCore_AlpakaInterface_interface_CachingAllocator_h
0003
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <exception>
#include <iomanip>
#include <iostream>
#include <limits>
#include <map>
#include <mutex>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>

#include <alpaka/alpaka.hpp>

#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
#include "HeterogeneousCore/AlpakaInterface/interface/AllocatorConfig.h"
#include "HeterogeneousCore/AlpakaInterface/interface/AlpakaServiceFwd.h"
0021
0022
0023
0024 namespace cms::alpakatools {
0025
0026 namespace detail {
0027
// Integer exponentiation by squaring: returns `base` raised to `exponent`.
//
// The result is accumulated and returned as std::size_t rather than
// unsigned int, because the value is used as a bin size in bytes: with a
// 32-bit accumulator any bin of 4 GiB or more (e.g. 2^32) silently wrapped
// around. Results that fit in an unsigned int are unchanged.
// By construction power(x, 0) == 1 for every x, including 0.
inline constexpr std::size_t power(unsigned int base, unsigned int exponent) {
  std::size_t result = 1;
  std::size_t factor = base;
  for (; exponent != 0; exponent >>= 1) {
    // multiply in the current power of `base` when the matching bit is set
    if (exponent & 1u) {
      result *= factor;
    }
    factor *= factor;
  }
  return result;
}
0039
0040
// Format a size in bytes as a human-readable string.
//
// - the maximum value of size_t is reported as "unlimited";
// - a non-zero exact multiple of 1 TiB, GiB, MiB or KiB is printed in the
//   largest such unit (e.g. 4096 -> "4 KiB");
// - anything else, including 0, is printed as a plain number of bytes.
//
// The unit thresholds are built with 64-bit arithmetic: the former `1ul << 40`
// is undefined wherever `unsigned long` is only 32 bits wide.
inline std::string as_bytes(size_t value) {
  if (value == std::numeric_limits<size_t>::max()) {
    return "unlimited";
  }

  struct Unit {
    unsigned int shift;
    char const* suffix;
  };
  // ordered from the largest unit to the smallest, so the first match wins
  constexpr Unit units[] = {{40, " TiB"}, {30, " GiB"}, {20, " MiB"}, {10, " KiB"}};

  std::uint64_t const bytes = value;
  for (Unit const& unit : units) {
    std::uint64_t const unitBytes = std::uint64_t{1} << unit.shift;
    // `bytes >= unitBytes` keeps 0 (a multiple of everything) out of this branch
    if (bytes >= unitBytes and bytes % unitBytes == 0) {
      return std::to_string(bytes >> unit.shift) + unit.suffix;
    }
  }
  return std::to_string(value) + " B";
}
0056
0057 }
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088 template <typename TDev, typename TQueue>
0089 class CachingAllocator {
0090 public:
0091 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
0092 friend class alpaka_cuda_async::AlpakaService;
0093 #endif
0094 #ifdef ALPAKA_ACC_GPU_HIP_ENABLED
0095 friend class alpaka_rocm_async::AlpakaService;
0096 #endif
0097 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
0098 friend class alpaka_serial_sync::AlpakaService;
0099 #endif
0100 #ifdef ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
0101 friend class alpaka_tbb_async::AlpakaService;
0102 #endif
0103
0104 using Device = TDev;
0105 using Queue = TQueue;
0106 using Event = alpaka::Event<Queue>;
0107 using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;
0108
0109
0110 static_assert(alpaka::isDevice<Device>, "TDev should be an alpaka Device type.");
0111 static_assert(alpaka::isQueue<Queue>, "TQueue should be an alpaka Queue type.");
0112 static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
0113 "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be the "
0114 "host CPU.");
0115
// Byte counters describing the state of the allocator, as returned by
// cacheStatus(). All counters start at zero.
struct CachedBytes {
  // total size of the blocks currently held in the cache, ready for reuse
  size_t free{0};
  // total size of the blocks currently handed out and not yet freed
  size_t live{0};
  // total size requested by the callers for the blocks currently handed out
  size_t requested{0};
};
0121
0122 explicit CachingAllocator(
0123 Device const& device,
0124 AllocatorConfig const& config,
0125 bool reuseSameQueueAllocations,
0126
0127
0128
0129 bool debug = false)
0130 : device_(device),
0131 binGrowth_(config.binGrowth),
0132 minBin_(config.minBin),
0133 maxBin_(config.maxBin),
0134 minBinBytes_(detail::power(binGrowth_, minBin_)),
0135 maxBinBytes_(detail::power(binGrowth_, maxBin_)),
0136 maxCachedBytes_(cacheSize(config.maxCachedBytes, config.maxCachedFraction)),
0137 reuseSameQueueAllocations_(reuseSameQueueAllocations),
0138 debug_(debug),
0139 fillAllocations_(config.fillAllocations),
0140 fillAllocationValue_(config.fillAllocationValue),
0141 fillReallocations_(config.fillReallocations),
0142 fillReallocationValue_(config.fillReallocationValue),
0143 fillDeallocations_(config.fillDeallocations),
0144 fillDeallocationValue_(config.fillDeallocationValue),
0145 fillCaches_(config.fillCaches),
0146 fillCacheValue_(config.fillCacheValue) {
0147 if (debug_) {
0148 std::ostringstream out;
0149 out << "CachingAllocator settings\n"
0150 << " bin growth " << binGrowth_ << "\n"
0151 << " min bin " << minBin_ << "\n"
0152 << " max bin " << maxBin_ << "\n"
0153 << " resulting bins:\n";
0154 for (auto bin = minBin_; bin <= maxBin_; ++bin) {
0155 auto binSize = detail::power(binGrowth_, bin);
0156 out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n';
0157 }
0158 out << " maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_);
0159 std::cout << out.str() << std::endl;
0160 }
0161 }
0162
0163 ~CachingAllocator() {
0164 {
0165
0166 std::scoped_lock lock(mutex_);
0167 assert(liveBlocks_.empty());
0168 assert(cachedBytes_.live == 0);
0169 }
0170
0171 freeAllCached();
0172 }
0173
0174
0175 CachedBytes cacheStatus() const {
0176 std::scoped_lock lock(mutex_);
0177 return cachedBytes_;
0178 }
0179
0180
0181
0182
0183
0184
0185
0186
0187 void immediateOrAsyncMemset(Queue queue, Buffer buffer, uint8_t value) {
0188
0189 if (std::is_same_v<Device, alpaka::DevCpu> and not reuseSameQueueAllocations_) {
0190 std::memset(buffer.data(), value, alpaka::getExtentProduct(buffer) * sizeof(alpaka::Elem<Buffer>));
0191 } else {
0192 alpaka::memset(queue, buffer, value);
0193 }
0194 }
0195
0196
0197 void* allocate(size_t bytes, Queue queue) {
0198
0199 BlockDescriptor block;
0200 block.queue = std::move(queue);
0201 block.requested = bytes;
0202 std::tie(block.bin, block.bytes) = findBin(bytes);
0203
0204
0205 if (tryReuseCachedBlock(block)) {
0206
0207 if (fillReallocations_) {
0208 immediateOrAsyncMemset(*block.queue, *block.buffer, fillReallocationValue_);
0209 } else if (fillAllocations_) {
0210 immediateOrAsyncMemset(*block.queue, *block.buffer, fillAllocationValue_);
0211 }
0212 } else {
0213 allocateNewBlock(block);
0214
0215 if (fillAllocations_) {
0216 immediateOrAsyncMemset(*block.queue, *block.buffer, fillAllocationValue_);
0217 }
0218 }
0219
0220 return block.buffer->data();
0221 }
0222
0223
0224 void free(void* ptr) {
0225 std::scoped_lock lock(mutex_);
0226
0227 auto iBlock = liveBlocks_.find(ptr);
0228 if (iBlock == liveBlocks_.end()) {
0229 std::stringstream ss;
0230 ss << "Trying to free a non-live block at " << ptr;
0231 throw std::runtime_error(ss.str());
0232 }
0233
0234 BlockDescriptor block = std::move(iBlock->second);
0235 liveBlocks_.erase(iBlock);
0236 cachedBytes_.live -= block.bytes;
0237 cachedBytes_.requested -= block.requested;
0238
0239 bool recache = (cachedBytes_.free + block.bytes <= maxCachedBytes_);
0240 if (recache) {
0241
0242
0243
0244
0245
0246
0247
0248 try {
0249
0250 if (fillCaches_) {
0251 alpaka::memset(*block.queue, *block.buffer, fillCacheValue_);
0252 } else if (fillDeallocations_) {
0253 alpaka::memset(*block.queue, *block.buffer, fillDeallocationValue_);
0254 }
0255
0256 alpaka::enqueue(*(block.queue), *(block.event));
0257 } catch (std::exception& e) {
0258 if (debug_) {
0259 std::ostringstream out;
0260 out << "CachingAllocator::free() caught an alpaka error: " << e.what() << "\n";
0261 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
0262 << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
0263 << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
0264 << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
0265 << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
0266 std::cout << out.str() << std::endl;
0267 }
0268 return;
0269 }
0270 cachedBytes_.free += block.bytes;
0271
0272
0273 cachedBlocks_.insert(std::make_pair(block.bin, block));
0274
0275 if (debug_) {
0276 std::ostringstream out;
0277 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes << " bytes at "
0278 << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << " , event "
0279 << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
0280 << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
0281 << " bytes) outstanding." << std::endl;
0282 std::cout << out.str() << std::endl;
0283 }
0284 } else {
0285
0286
0287
0288
0289
0290
0291 try {
0292
0293 if (fillDeallocations_) {
0294 alpaka::memset(*block.queue, *block.buffer, fillDeallocationValue_);
0295 }
0296 } catch (std::exception& e) {
0297 if (debug_) {
0298 std::ostringstream out;
0299 out << "CachingAllocator::free() caught an alpaka error: " << e.what() << "\n";
0300 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
0301 << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
0302 << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
0303 << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
0304 << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
0305 std::cout << out.str() << std::endl;
0306 }
0307 return;
0308 }
0309
0310 if (debug_) {
0311 std::ostringstream out;
0312 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes << " bytes at "
0313 << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() << ", event "
0314 << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() << " available blocks cached ("
0315 << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
0316 << " bytes) outstanding." << std::endl;
0317 std::cout << out.str() << std::endl;
0318 }
0319 }
0320 }
0321
0322 private:
0323 struct BlockDescriptor {
0324 std::optional<Buffer> buffer;
0325 std::optional<Queue> queue;
0326 std::optional<Event> event;
0327 size_t bytes = 0;
0328 size_t requested = 0;
0329 unsigned int bin = 0;
0330
0331
0332 auto device() { return alpaka::getDev(*queue); }
0333 };
0334
0335 private:
0336
0337 size_t cacheSize(size_t maxCachedBytes, double maxCachedFraction) const {
0338
0339 size_t totalMemory = alpaka::getMemBytes(device_);
0340 size_t memoryFraction = static_cast<size_t>(maxCachedFraction * totalMemory);
0341 size_t size = std::numeric_limits<size_t>::max();
0342 if (maxCachedBytes > 0 and maxCachedBytes < size) {
0343 size = maxCachedBytes;
0344 }
0345 if (memoryFraction > 0 and memoryFraction < size) {
0346 size = memoryFraction;
0347 }
0348 return size;
0349 }
0350
0351
0352 std::tuple<unsigned int, size_t> findBin(size_t bytes) const {
0353 if (bytes < minBinBytes_) {
0354 return std::make_tuple(minBin_, minBinBytes_);
0355 }
0356 if (bytes > maxBinBytes_) {
0357 throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
0358 " bytes is too large for the caching detail with maximum bin " +
0359 std::to_string(maxBinBytes_) +
0360 " bytes. You might want to increase the maximum bin size");
0361 }
0362 unsigned int bin = minBin_;
0363 size_t binBytes = minBinBytes_;
0364 while (binBytes < bytes) {
0365 ++bin;
0366 binBytes *= binGrowth_;
0367 }
0368 return std::make_tuple(bin, binBytes);
0369 }
0370
0371 bool tryReuseCachedBlock(BlockDescriptor& block) {
0372 std::scoped_lock lock(mutex_);
0373
0374
0375 const auto [begin, end] = cachedBlocks_.equal_range(block.bin);
0376 for (auto iBlock = begin; iBlock != end; ++iBlock) {
0377 if ((reuseSameQueueAllocations_ and (*block.queue == *(iBlock->second.queue))) or
0378 alpaka::isComplete(*(iBlock->second.event))) {
0379
0380 auto queue = std::move(*(block.queue));
0381
0382 block = iBlock->second;
0383 block.queue = std::move(queue);
0384
0385
0386 if (block.device() != alpaka::getDev(*(block.event))) {
0387 block.event = Event{block.device()};
0388 }
0389
0390
0391
0392 liveBlocks_[block.buffer->data()] = block;
0393
0394
0395 cachedBytes_.free -= block.bytes;
0396 cachedBytes_.live += block.bytes;
0397 cachedBytes_.requested += block.requested;
0398
0399 if (debug_) {
0400 std::ostringstream out;
0401 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at "
0402 << block.buffer->data() << " (" << block.bytes << " bytes) for queue "
0403 << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get()
0404 << " (previously associated with queue " << iBlock->second.queue->m_spQueueImpl.get() << " , event "
0405 << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
0406 std::cout << out.str() << std::endl;
0407 }
0408
0409
0410 cachedBlocks_.erase(iBlock);
0411 return true;
0412 }
0413 }
0414
0415 return false;
0416 }
0417
0418 Buffer allocateBuffer(size_t bytes, Queue const& queue) {
0419 if constexpr (std::is_same_v<Device, alpaka::Dev<Queue>>) {
0420
0421 return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
0422 } else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
0423
0424 using Platform = alpaka::Platform<alpaka::Dev<Queue>>;
0425 return alpaka::allocMappedBuf<Platform, std::byte, size_t>(device_, platform<Platform>(), bytes);
0426 } else {
0427
0428 static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
0429 "The \"memory device\" type can either be the same as the \"synchronisation device\" type, or be "
0430 "the host CPU.");
0431 }
0432 }
0433
0434 void allocateNewBlock(BlockDescriptor& block) {
0435 try {
0436 block.buffer = allocateBuffer(block.bytes, *block.queue);
0437 } catch (std::runtime_error const& e) {
0438
0439 if (debug_) {
0440 std::ostringstream out;
0441 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " << block.bytes
0442 << " bytes for queue " << block.queue->m_spQueueImpl.get()
0443 << ", retrying after freeing cached allocations" << std::endl;
0444 std::cout << out.str() << std::endl;
0445 }
0446
0447 freeAllCached();
0448
0449
0450 block.buffer = allocateBuffer(block.bytes, *block.queue);
0451 }
0452
0453
0454 block.event = Event{block.device()};
0455
0456 {
0457 std::scoped_lock lock(mutex_);
0458 cachedBytes_.live += block.bytes;
0459 cachedBytes_.requested += block.requested;
0460
0461 liveBlocks_[block.buffer->data()] = block;
0462 }
0463
0464 if (debug_) {
0465 std::ostringstream out;
0466 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at "
0467 << block.buffer->data() << " (" << block.bytes << " bytes associated with queue "
0468 << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." << std::endl;
0469 std::cout << out.str() << std::endl;
0470 }
0471 }
0472
0473 void freeAllCached() {
0474 std::scoped_lock lock(mutex_);
0475
0476 while (not cachedBlocks_.empty()) {
0477 auto iBlock = cachedBlocks_.begin();
0478 cachedBytes_.free -= iBlock->second.bytes;
0479
0480 if (debug_) {
0481 std::ostringstream out;
0482 out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes
0483 << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) << " available blocks cached (" << cachedBytes_.free
0484 << " bytes), " << liveBlocks_.size() << " live blocks (" << cachedBytes_.live << " bytes) outstanding."
0485 << std::endl;
0486 std::cout << out.str() << std::endl;
0487 }
0488
0489 cachedBlocks_.erase(iBlock);
0490 }
0491 }
0492
0493
0494 using CachedBlocks = std::multimap<unsigned int, BlockDescriptor>;
0495
0496 using BusyBlocks = std::map<void*, BlockDescriptor>;
0497
0498 inline static const std::string deviceType_ = alpaka::core::demangled<Device>;
0499
0500 mutable std::mutex mutex_;
0501 Device device_;
0502
0503 CachedBytes cachedBytes_;
0504 CachedBlocks cachedBlocks_;
0505 BusyBlocks liveBlocks_;
0506
0507 const unsigned int binGrowth_;
0508 const unsigned int minBin_;
0509 const unsigned int maxBin_;
0510
0511 const size_t minBinBytes_;
0512 const size_t maxBinBytes_;
0513 const size_t maxCachedBytes_;
0514
0515 const bool reuseSameQueueAllocations_;
0516 const bool debug_;
0517
0518 const bool fillAllocations_;
0519 const uint8_t fillAllocationValue_;
0520 const bool fillReallocations_;
0521 const uint8_t fillReallocationValue_;
0522 const bool fillDeallocations_;
0523 const uint8_t fillDeallocationValue_;
0524 const bool fillCaches_;
0525 const uint8_t fillCacheValue_;
0526 };
0527
0528 }
0529
0530 #endif